Implement a local machine learning model for client-side spam filtering.

The local model is implemented as a TensorFlow "LayersModel"; a separate model is trained for each available mailbox, resulting in one model per ownerGroup (i.e. mailbox). The training data is initially aggregated from the last 30 days of received mails and stored in a separate offline database table named spam_classification_training_data; the trained model is stored in the table spam_classification_model. Initial training starts after indexing, with periodic retraining every 30 minutes and on each subsequent login.

The model predicts on each incoming mail once the entity event for that mail has been received, moving it to either the inbox or the spam folder. When users move mails, the training data labels are updated accordingly by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now carries a moveReason, which indicates that a mail was moved by our spam filter.

Client-side spam filtering can be activated via the SpamClientClassification feature flag and is currently only available on the desktop client.

Co-authored-by: sug <sug@tutao.de>
Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com>
Co-authored-by: abp <abp@tutao.de>
Co-authored-by: map <mpfau@users.noreply.github.com>
Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com>
Co-authored-by: frm <frm@tutao.de>
Co-authored-by: das <das@tutao.de>
Co-authored-by: nif <nif@tutao.de>
Co-authored-by: amm <amm@tutao.de>
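For context, a minimal sketch of what one such per-mailbox binary classifier could look like when built on a TensorFlow.js LayersModel. The function names, layer sizes, 1024-dimensional input, and training parameters below are illustrative assumptions, not the actual Tuta implementation.

// Sketch only: a tiny dense network that outputs a spam probability in [0, 1].
// The architecture and hyperparameters are assumptions for illustration.
import * as tf from "@tensorflow/tfjs"

const INPUT_DIMENSION = 1024 // assumed size of the hashed feature vector

function buildSpamModel(): tf.LayersModel {
	const model = tf.sequential()
	model.add(tf.layers.dense({ inputShape: [INPUT_DIMENSION], units: 16, activation: "relu" }))
	model.add(tf.layers.dense({ units: 1, activation: "sigmoid" }))
	model.compile({ optimizer: "adam", loss: "binaryCrossentropy", metrics: ["accuracy"] })
	return model
}

// Trains on vectorized mails (one row per mail) with labels 0 = ham, 1 = spam.
async function trainSpamModel(model: tf.LayersModel, vectors: number[][], labels: number[]): Promise<void> {
	const xs = tf.tensor2d(vectors)
	const ys = tf.tensor2d(labels, [labels.length, 1])
	try {
		await model.fit(xs, ys, { epochs: 5, shuffle: true })
	} finally {
		xs.dispose()
		ys.dispose()
	}
}

At prediction time, model.predict on a single hashed mail vector yields the spam probability that would drive the inbox-or-spam decision described above.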
import o from "@tutao/otest"
import { HashingVectorizer } from "../../../../../../src/mail-app/workerUtils/spamClassification/HashingVectorizer"
import { arrayEquals } from "@tutao/tutanota-utils"

// Lowercases, splits on whitespace, and drops single-character tokens.
export const tokenize = (text: string): string[] =>
	text
		.toLowerCase()
		.split(/\s+/)
		.filter((t) => t.length > 1)

o.spec("HashingVectorizer", () => {
	const rawDocuments = [
		"Tuta is an encrypted email service that prioritizes privacy and open-source principles.",
		"With Tuta, your emails and contacts are stored securely using end-to-end encryption.",
		"With Tuta, you can create a completely encrypted zero-knowledge calendar, try now!",
		"Unlike traditional email providers, Tuta never collects user data or scans your messages.",
		"Millions of people choose Tuta to protect their personal and professional communication.",
	]

	const tokenizedDocuments = rawDocuments.map(tokenize)

	// Hashing is deterministic, so identical token lists must map to identical vectors.
	o("vectorize creates same vector for same tokens", async () => {
		const vectorizer = new HashingVectorizer()
		const tokens = ["privacy", "email", "data"]
		const v1 = await vectorizer.vectorize(tokens)
		const v2 = await vectorizer.vectorize(tokens)
		o(arrayEquals(v1, v2)).equals(true)
	})

	// An empty token list should produce an all-zero vector.
	o("vectorize handles empty input", async () => {
		const vectorizer = new HashingVectorizer()
		const vector = await vectorizer.vectorize([])
		o(vector.every((v) => v === 0)).equals(true)
	})

	// transform() should return one vector per document, each of the configured dimension.
	o("transform returns correct shape", async () => {
		const vectorizer = new HashingVectorizer()
		const tensor = await vectorizer.transform(tokenizedDocuments)

		o(tensor.length).equals(tokenizedDocuments.length)
		for (const vec of tensor) {
			o(vec.length).equals(vectorizer.dimension)
		}
	})
})
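For reference, a minimal sketch of a vectorizer satisfying the behaviour these tests pin down: deterministic output for the same tokens, an all-zero vector for empty input, and one row per document of length dimension. It uses the feature-hashing trick with a 32-bit FNV-1a hash; the hash choice, the class name, and the 1024 default dimension are assumptions, not the real HashingVectorizer.

// Sketch only: maps token lists onto fixed-length count vectors via the hashing trick.
export class HashingVectorizerSketch {
	constructor(readonly dimension: number = 1024) {}

	// Each token increments the bucket its hash falls into, so output is
	// deterministic and an empty input leaves the vector all zeros.
	async vectorize(tokens: string[]): Promise<number[]> {
		const vector = new Array(this.dimension).fill(0)
		for (const token of tokens) {
			vector[this.hash(token) % this.dimension] += 1
		}
		return vector
	}

	// Vectorizes each tokenized document, yielding a documents x dimension matrix.
	async transform(tokenizedDocuments: string[][]): Promise<number[][]> {
		return Promise.all(tokenizedDocuments.map((tokens) => this.vectorize(tokens)))
	}

	// 32-bit FNV-1a string hash; any stable hash function works for this trick.
	private hash(token: string): number {
		let h = 0x811c9dc5
		for (let i = 0; i < token.length; i++) {
			h ^= token.charCodeAt(i)
			h = Math.imul(h, 0x01000193)
		}
		return h >>> 0
	}
}

Because the vector length is fixed by hashing rather than by a learned vocabulary, the vectorizer needs no fitting step, which suits per-mailbox training on a client where the vocabulary changes as mail arrives.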