mirror of
https://github.com/tutao/tutanota.git
synced 2025-12-07 13:49:47 +00:00
We sync the spam training data encrypted through our server to make sure that all clients for a specific user behave the same when classifying mails. Additionally, this enables the spam classification in the webApp. We compress the training data vectors (see clientSpamTrainingDatum) before uploading to our server using SparseVectorCompressor.ts. When a user has the ClientSpamClassification enabled, the spam training data sync will happen for every mail received. ClientSpamTrainingDatum instances are not stored in the CacheStorage. No entityEvents are emitted for this type. However, we retrieve creations and updates for ClientSpamTrainingData through the modifiedClientSpamTrainingDataIndex. We calculate a threshold per classifier based on the dataset's ham-to-spam ratio; we also subsample our training data to cap the ham-to-spam ratio within a certain limit. Co-authored-by: jomapp <17314077+jomapp@users.noreply.github.com> Co-authored-by: das <das@tutao.de> Co-authored-by: abp <abp@tutao.de> Co-authored-by: Kinan <104761667+kibibytium@users.noreply.github.com> Co-authored-by: sug <sug@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com>
40 lines
1.7 KiB
TypeScript
40 lines
1.7 KiB
TypeScript
import o from "@tutao/otest"
|
|
import { HashingVectorizer } from "../../../../../../src/mail-app/workerUtils/spamClassification/HashingVectorizer"
|
|
import { arrayEquals } from "@tutao/tutanota-utils"
|
|
import { spamClassifierTokenizer } from "../../../../../../src/common/api/common/utils/spamClassificationUtils/SpamMailProcessor"
|
|
|
|
/**
 * Tests for HashingVectorizer: repeated vectorization of the same tokens,
 * vectorization of an empty token list, and the shape of transform() output.
 */
o.spec("HashingVectorizer", () => {
	// Sample mail texts; tokenized once below and fed to transform() in the shape test.
	const rawDocuments = [
		"Tuta is an encrypted email service that prioritizes privacy and open-source principles.",
		"With Tuta, your emails and contacts are stored securely using end-to-end encryption.",
		"With Tuta, you can create a completely encrypted zero-knowledge calendar, try now!",
		"Unlike traditional email providers, Tuta never collects user data or scans your messages.",
		"Millions of people choose Tuta to protect their personal and professional communication.",
	]

	const tokenizedDocuments = rawDocuments.map(spamClassifierTokenizer)

	o("vectorize creates same vector for same tokens", async () => {
		const vectorizer = new HashingVectorizer()
		const tokens = ["privacy", "email", "data"]
		const v1 = await vectorizer.vectorize(tokens)
		const v2 = await vectorizer.vectorize(tokens)
		// Guard against a vacuous pass: two empty vectors would also compare equal.
		// NOTE(review): assumes vectorize() yields `dimension` entries, matching what the
		// transform test below expects per document - confirm against HashingVectorizer.
		o(v1.length).equals(vectorizer.dimension)
		o(arrayEquals(v1, v2)).equals(true)
	})

	o("vectorize handles empty input", async () => {
		const vectorizer = new HashingVectorizer()
		const vector = await vectorizer.vectorize([])
		// every() is true for an empty array, so check the length first; otherwise an
		// empty result would satisfy the all-zero assertion without testing anything.
		// NOTE(review): same assumption as above about the vector length - confirm.
		o(vector.length).equals(vectorizer.dimension)
		o(vector.every((v) => v === 0)).equals(true)
	})

	o("transform returns correct shape", async () => {
		const vectorizer = new HashingVectorizer()
		const tensor = await vectorizer.transform(tokenizedDocuments)

		// One vector per input document ...
		o(tensor.length).equals(tokenizedDocuments.length)
		// ... and each vector as long as the vectorizer's dimension.
		for (const vec of tensor) {
			o(vec.length).equals(vectorizer.dimension)
		}
	})
})
|