import o from "@tutao/otest"
import { DynamicTfVectorizer } from "../../../../../../src/mail-app/workerUtils/spamClassification/DynamicTfVectorizer"

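// Tests for the term-frequency (TF) vectorizer used by the spam classification worker.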
o.spec("DynamicTfVectorizer", () => {
|
||
|
|
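	// Simple tokenizer: lowercase, split on whitespace, strip punctuation, drop single-character tokens.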
	const tokenize = (text: string): string[] =>
		text
			.toLowerCase()
			.split(/\s+/)
			.map((t) => t.replace(/[^a-z0-9-]/gi, "")) // remove punctuation
			.filter((t) => t.length > 1)

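	// Small sample corpus used to build the test vocabulary.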
	const rawDocuments = [
		"Tuta is an encrypted email service that prioritizes privacy and open-source principles.",
		"With Tuta, your emails and contacts are stored securely using end-to-end encryption.",
		"With Tuta, you can create a completely encrypted zero-knowledge calendar, try now!",
		"Unlike traditional email providers, Tuta never collects user data or scans your messages.",
		"Millions of people choose Tuta to protect their personal and professional communication.",
	]

	const tokenizedDocuments = rawDocuments.map(tokenize)

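	// Added sanity check for the local tokenize helper above; it exercises only the
	// helper itself and otest's deepEquals assertion, not DynamicTfVectorizer.
	o("tokenize lowercases, strips punctuation, and drops single-character tokens", () => {
		o(tokenize("Tuta is a Hands-On, privacy-first E-Mail service!")).deepEquals(["tuta", "is", "hands-on", "privacy-first", "e-mail", "service"])
	})
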
o("constructor throws if docIds and documents mismatch", () => {
|
||
|
|
// o(() => new DynamicTfVectorizer(["doc1"], [["token1"], ["token2"]])).throws(Error)
|
||
|
|
})
|
||
|
|
|
||
|
|
// o("builds correct vocabulary with filtered tokens", () => {
|
||
|
|
// const vectorizer = new DynamicTfVectorizer()
|
||
|
|
// vectorizer.initializeVocabulary(tokenizedDocuments)
|
||
|
|
// o(vectorizer.vocabulary.includes("tuta")).equals(true)
|
||
|
|
// o(vectorizer.vocabulary.includes("email")).equals(true)
|
||
|
|
// o(vectorizer.vocabulary.includes("a")).equals(false)
|
||
|
|
// })
|
||
|
|
|
||
|
|
// o("vectorize returns correct TF vector", () => {
|
||
|
|
// const vectorizer = new DynamicTfVectorizer()
|
||
|
|
// vectorizer.initializeVocabulary(tokenizedDocuments)
|
||
|
|
// const tokens = ["email", "encryption"]
|
||
|
|
// const vector = vectorizer.vectorize(tokens)
|
||
|
|
// o(vector.length).equals(vectorizer.featureVectorDimension)
|
||
|
|
//
|
||
|
|
// const emailIndex = vectorizer.vocabulary.includes("email")!
|
||
|
|
// const encryptionIndex = vectorizer.vocabulary.includes("encryption")!
|
||
|
|
// o(emailIndex).equals(true)
|
||
|
|
// o(encryptionIndex).equals(true)
|
||
|
|
// })
|
||
|
|
|
||
|
|
// o("transform returns correct tensor shape", () => {
|
||
|
|
// const vectorizer = new DynamicTfVectorizer()
|
||
|
|
// vectorizer.initializeVocabulary(tokenizedDocuments)
|
||
|
|
// const inputTokens = [
|
||
|
|
// ["privacy", "encryption"],
|
||
|
|
// ["user", "data"],
|
||
|
|
// ]
|
||
|
|
// const vector = vectorizer.transform(inputTokens)
|
||
|
|
//
|
||
|
|
// o(vector.length).equals(2)
|
||
|
|
// o(vector[0].length).equals(vectorizer.featureVectorDimension)
|
||
|
|
//
|
||
|
|
// const allZeros = Array.from(vector.flat()).every((v) => v === 0)
|
||
|
|
// o(allZeros).equals(false)
|
||
|
|
// })
|
||
|
|
|
||
|
|
// o("adds unknown words to vocabulary when still enough space", () => {
|
||
|
|
// const vectorizer = new DynamicTfVectorizer()
|
||
|
|
// vectorizer.initializeVocabulary(tokenizedDocuments)
|
||
|
|
// const tokens = ["hannover", "munich"]
|
||
|
|
// const vector = vectorizer.vectorize(tokens)
|
||
|
|
// const nonZero = vector.some((val) => val > 0)
|
||
|
|
// o(nonZero).equals(true)
|
||
|
|
// })
|
||
|
|
})