tutanota/test/tests/api/worker/utils/spamClassification/HashingVectorizerTest.ts
das fd22294a18
[antispam] Add client-side local spam filtering
Implement a local machine learning model for client-side spam filtering.
The model is built with TensorFlow's "LayersModel" API and trained
separately for each available mailbox, resulting in one model per
ownerGroup (i.e. mailbox).
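
As a rough illustration of the per-mailbox setup, such a classifier
could be assembled with tfjs as sketched below; the layer sizes, the
optimizer, and the INPUT_DIMENSION constant are illustrative
assumptions, not the shipped architecture:

import * as tf from "@tensorflow/tfjs"

// Hypothetical sketch: a small dense network over hashed token counts.
const INPUT_DIMENSION = 1024

function buildSpamModel(): tf.LayersModel {
	const model = tf.sequential()
	model.add(tf.layers.dense({ inputShape: [INPUT_DIMENSION], units: 16, activation: "relu" }))
	model.add(tf.layers.dense({ units: 1, activation: "sigmoid" })) // outputs P(spam)
	model.compile({ optimizer: tf.train.adam(), loss: "binaryCrossentropy", metrics: ["accuracy"] })
	return model
}

// One model instance per ownerGroup (i.e. per mailbox).
const modelsByOwnerGroup = new Map<string, tf.LayersModel>()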

The initial training data is aggregated from the last 30 days of
received mails and stored in a separate offline database table named
spam_classification_training_data; the trained model is stored in the
table spam_classification_model. Initial training starts after
indexing, and the model is retrained every 30 minutes as well as on
each subsequent login.
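
To make the data flow concrete, a training-data row and the retraining
schedule might look roughly like the sketch below; the exact columns
and the scheduling helper are assumptions, only the table name, the
isSpam/isSpamConfidence fields, and the 30-minute interval come from
this change:

// Hypothetical row shape for spam_classification_training_data.
interface SpamTrainingRow {
	ownerGroup: string
	mailId: string
	receivedDate: Date
	tokens: string[] // tokenized subject and body
	isSpam: boolean
	isSpamConfidence: number
}

const TRAINING_INTERVAL_MS = 30 * 60 * 1000 // retrain every 30 minutes

function schedulePeriodicTraining(train: () => Promise<void>): void {
	setInterval(() => void train(), TRAINING_INTERVAL_MS)
}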

The model predicts on each incoming mail once its entity event has
been received, moving the mail to either the inbox or the spam folder.
When users move mails, we update the training data labels accordingly
by adjusting the isSpam and isSpamConfidence values in the offline
database. The MoveMailService now carries a moveReason, which
indicates whether the mail was moved by our spam filter.
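
Sketched in TypeScript, the label update on a user move could look
like this; the handler, the MoveReason values, and the
updateTrainingRow helper are hypothetical, not the actual
MoveMailService API:

type MoveReason = "userAction" | "spamClassification"

// A manual user move is a strong signal: overwrite the stored label
// with full confidence. updateTrainingRow is an assumed helper around
// the offline database.
async function onMailMoved(mailId: string, movedToSpam: boolean, reason: MoveReason): Promise<void> {
	if (reason === "userAction") {
		await updateTrainingRow(mailId, { isSpam: movedToSpam, isSpamConfidence: 1.0 })
	}
}

declare function updateTrainingRow(mailId: string, fields: { isSpam: boolean; isSpamConfidence: number }): Promise<void>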

Client-side spam filtering can be activated using the
SpamClientClassification feature flag, and is for now only
available on the desktop client.

Co-authored-by: sug <sug@tutao.de>
Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com>
Co-authored-by: abp <abp@tutao.de>
Co-authored-by: map <mpfau@users.noreply.github.com>
Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com>
Co-authored-by: frm <frm@tutao.de>
Co-authored-by: das <das@tutao.de>
Co-authored-by: nif <nif@tutao.de>
Co-authored-by: amm <amm@tutao.de>
2025-10-22 09:25:20 +02:00


import o from "@tutao/otest"
import { HashingVectorizer } from "../../../../../../src/mail-app/workerUtils/spamClassification/HashingVectorizer"
import { arrayEquals } from "@tutao/tutanota-utils"

export const tokenize = (text: string): string[] =>
	text
		.toLowerCase()
		.split(/\s+/)
		.filter((t) => t.length > 1)

o.spec("HashingVectorizer", () => {
	const rawDocuments = [
		"Tuta is an encrypted email service that prioritizes privacy and open-source principles.",
		"With Tuta, your emails and contacts are stored securely using end-to-end encryption.",
		"With Tuta, you can create a completely encrypted zero-knowledge calendar, try now!",
		"Unlike traditional email providers, Tuta never collects user data or scans your messages.",
		"Millions of people choose Tuta to protect their personal and professional communication.",
	]
	const tokenizedDocuments = rawDocuments.map(tokenize)

	o("vectorize creates same vector for same tokens", async () => {
		const vectorizer = new HashingVectorizer()
		const tokens = ["privacy", "email", "data"]
		const v1 = await vectorizer.vectorize(tokens)
		const v2 = await vectorizer.vectorize(tokens)
		o(arrayEquals(v1, v2)).equals(true)
	})

	o("vectorize handles empty input", async () => {
		const vectorizer = new HashingVectorizer()
		const vector = await vectorizer.vectorize([])
		o(vector.every((v) => v === 0)).equals(true)
	})

	o("transform returns correct shape", async () => {
		const vectorizer = new HashingVectorizer()
		const tensor = await vectorizer.transform(tokenizedDocuments)
		o(tensor.length).equals(tokenizedDocuments.length)
		for (const vec of tensor) {
			o(vec.length).equals(vectorizer.dimension)
		}
	})
})
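
For context, a HashingVectorizer satisfying these tests could be
sketched as follows; only the vectorize/transform/dimension surface is
taken from the tests above, while the FNV-1a hash and the default
dimension of 1024 are assumptions about the real implementation:

export class HashingVectorizer {
	constructor(readonly dimension: number = 1024) {}

	// Feature hashing: each token is hashed to a bucket index and the
	// bucket count is incremented, so identical token lists always map
	// to identical vectors and an empty list maps to the zero vector.
	async vectorize(tokens: string[]): Promise<number[]> {
		const vector = new Array(this.dimension).fill(0)
		for (const token of tokens) {
			vector[this.hash(token) % this.dimension] += 1
		}
		return vector
	}

	async transform(documents: string[][]): Promise<number[][]> {
		return Promise.all(documents.map((tokens) => this.vectorize(tokens)))
	}

	// FNV-1a, kept in the unsigned 32-bit range via the >>> 0 coercion.
	private hash(token: string): number {
		let h = 0x811c9dc5
		for (let i = 0; i < token.length; i++) {
			h ^= token.charCodeAt(i)
			h = Math.imul(h, 0x01000193) >>> 0
		}
		return h
	}
}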