tutanota/test/tests/api/worker/utils/spamClassification/TfIdfVectorizerTest.ts
Commit fd22294a18 by das
[antispam] Add client-side local spam filtering
Implement a local machine learning model for client-side spam filtering.
The model is built on TensorFlow's "LayersModel" API; a separate model is
trained for every available mailbox, resulting in one model per
ownerGroup (i.e. per mailbox).
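
For illustration, a minimal classifier of this shape could be built as
follows; the helper name and layer sizes are assumptions for this sketch,
not values taken from the actual implementation:

import * as tf from "@tensorflow/tfjs"

// Hypothetical sketch: one small LayersModel per mailbox (ownerGroup)
// that maps a fixed-size feature vector to a spam probability.
function buildSpamModel(featureVectorDimension: number): tf.LayersModel {
	const model = tf.sequential()
	model.add(tf.layers.dense({ inputShape: [featureVectorDimension], units: 16, activation: "relu" }))
	model.add(tf.layers.dense({ units: 1, activation: "sigmoid" })) // probability that the mail is spam
	model.compile({ optimizer: "adam", loss: "binaryCrossentropy", metrics: ["accuracy"] })
	return model
}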

Initially, the training data is aggregated from the mails received in the
last 30 days and stored in a dedicated offline database table named
spam_classification_training_data; the trained model is stored in the
spam_classification_model table. The initial training starts after
indexing, and the model is retrained periodically every 30 minutes as
well as on each subsequent login.
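
A hedged sketch of what the two tables could look like; the exact columns
are assumptions derived from the values named above, not the actual
migration from this commit:

// Hypothetical schema for the offline database (sketch only).
const CREATE_TRAINING_DATA_TABLE = `
	CREATE TABLE IF NOT EXISTS spam_classification_training_data (
		mailId TEXT PRIMARY KEY,
		ownerGroup TEXT NOT NULL,      -- mailbox the mail belongs to
		receivedDate INTEGER NOT NULL, -- used for the 30-day aggregation window
		isSpam INTEGER NOT NULL,       -- training label: 0 = ham, 1 = spam
		isSpamConfidence REAL NOT NULL -- how certain the label is
	)`
const CREATE_MODEL_TABLE = `
	CREATE TABLE IF NOT EXISTS spam_classification_model (
		ownerGroup TEXT PRIMARY KEY, -- one model per mailbox
		model BLOB NOT NULL          -- serialized LayersModel
	)`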

The model predicts on each incoming mail as soon as its entity event has
been received, and the mail is then moved to either the inbox or the spam
folder. When users move mails themselves, the training data labels are
updated accordingly by adjusting the isSpam classification and
isSpamConfidence values in the offline database. The MoveMailService now
carries a moveReason, which indicates when a mail has been moved by the
spam filter.
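
Updating the labels on a user move could then look roughly like this; the
OfflineDb interface and the function name are hypothetical stand-ins:

// Hypothetical stand-in for the offline database facade.
interface OfflineDb {
	run(query: string, params: unknown[]): Promise<void>
}

// A user-initiated move is a strong signal, so the label is written with
// full confidence; moves performed by the filter itself (see moveReason)
// would not overwrite the label this way.
async function onUserMovedMail(db: OfflineDb, mailId: string, movedToSpam: boolean): Promise<void> {
	await db.run("UPDATE spam_classification_training_data SET isSpam = ?, isSpamConfidence = ? WHERE mailId = ?", [movedToSpam ? 1 : 0, 1.0, mailId])
}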

Client-side spam filtering can be activated via the
SpamClientClassification feature flag and is currently only available in
the desktop client.
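
A call site gating the feature could look like this; the function and
parameter names are assumptions for the sketch:

// Hypothetical gate: run the classifier only on desktop and only when
// the SpamClientClassification feature flag is enabled.
function shouldRunSpamClassifier(isDesktop: boolean, enabledFeatureFlags: ReadonlySet<string>): boolean {
	return isDesktop && enabledFeatureFlags.has("SpamClientClassification")
}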

Co-authored-by: sug <sug@tutao.de>
Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com>
Co-authored-by: abp <abp@tutao.de>
Co-authored-by: map <mpfau@users.noreply.github.com>
Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com>
Co-authored-by: frm <frm@tutao.de>
Co-authored-by: das <das@tutao.de>
Co-authored-by: nif <nif@tutao.de>
Co-authored-by: amm <amm@tutao.de>
2025-10-22 09:25:20 +02:00

import o from "@tutao/otest"
import { DynamicTfVectorizer } from "../../../../../../src/mail-app/workerUtils/spamClassification/DynamicTfVectorizer"
o.spec("DynamicTfVectorizer", () => {
const tokenize = (text: string): string[] =>
text
.toLowerCase()
.split(/\s+/)
.map((t) => t.replace(/[^a-z0-9-]/gi, "")) // remove punctuation
.filter((t) => t.length > 1)
const rawDocuments = [
"Tuta is an encrypted email service that prioritizes privacy and open-source principles.",
"With Tuta, your emails and contacts are stored securely using end-to-end encryption.",
"With Tuta, you can create a completely encrypted zero-knowledge calendar, try now!",
"Unlike traditional email providers, Tuta never collects user data or scans your messages.",
"Millions of people choose Tuta to protect their personal and professional communication.",
]
const tokenizedDocuments = rawDocuments.map(tokenize)
o("constructor throws if docIds and documents mismatch", () => {
// o(() => new DynamicTfVectorizer(["doc1"], [["token1"], ["token2"]])).throws(Error)
})
// o("builds correct vocabulary with filtered tokens", () => {
// const vectorizer = new DynamicTfVectorizer()
// vectorizer.initializeVocabulary(tokenizedDocuments)
// o(vectorizer.vocabulary.includes("tuta")).equals(true)
// o(vectorizer.vocabulary.includes("email")).equals(true)
// o(vectorizer.vocabulary.includes("a")).equals(false)
// })
// o("vectorize returns correct TF vector", () => {
// const vectorizer = new DynamicTfVectorizer()
// vectorizer.initializeVocabulary(tokenizedDocuments)
// const tokens = ["email", "encryption"]
// const vector = vectorizer.vectorize(tokens)
// o(vector.length).equals(vectorizer.featureVectorDimension)
//
// const emailIndex = vectorizer.vocabulary.includes("email")!
// const encryptionIndex = vectorizer.vocabulary.includes("encryption")!
// o(emailIndex).equals(true)
// o(encryptionIndex).equals(true)
// })
// o("transform returns correct tensor shape", () => {
// const vectorizer = new DynamicTfVectorizer()
// vectorizer.initializeVocabulary(tokenizedDocuments)
// const inputTokens = [
// ["privacy", "encryption"],
// ["user", "data"],
// ]
// const vector = vectorizer.transform(inputTokens)
//
// o(vector.length).equals(2)
// o(vector[0].length).equals(vectorizer.featureVectorDimension)
//
// const allZeros = Array.from(vector.flat()).every((v) => v === 0)
// o(allZeros).equals(false)
// })
// o("adds unknown words to vocabulary when still enough space", () => {
// const vectorizer = new DynamicTfVectorizer()
// vectorizer.initializeVocabulary(tokenizedDocuments)
// const tokens = ["hannover", "munich"]
// const vector = vectorizer.vectorize(tokens)
// const nonZero = vector.some((val) => val > 0)
// o(nonZero).equals(true)
// })
})