tutanota/test/tests/api/worker/utils/spamClassification/SpamClassifierTest.ts

668 lines
24 KiB
TypeScript
Raw Normal View History

[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
import o from "@tutao/otest"
import fs from "node:fs"
import { parseCsv } from "../../../../../../src/common/misc/parsing/CsvParser"
import {
DEFAULT_PREPROCESS_CONFIGURATION,
SpamClassifier,
SpamTrainMailDatum,
} from "../../../../../../src/mail-app/workerUtils/spamClassification/SpamClassifier"
import { OfflineStoragePersistence } from "../../../../../../src/mail-app/workerUtils/index/OfflineStoragePersistence"
import { matchers, object, when } from "testdouble"
import { assertNotNull, promiseMap } from "@tutao/tutanota-utils"
import { SpamClassificationInitializer } from "../../../../../../src/mail-app/workerUtils/spamClassification/SpamClassificationInitializer"
import { CacheStorage } from "../../../../../../src/common/api/worker/rest/DefaultEntityRestCache"
import { mockAttribute } from "@tutao/tutanota-test-utils"
import "@tensorflow/tfjs-backend-cpu"
import { HashingVectorizer } from "../../../../../../src/mail-app/workerUtils/spamClassification/HashingVectorizer"
import { LayersModel, tensor1d } from "../../../../../../src/mail-app/workerUtils/spamClassification/tensorflow-custom"
import { createTestEntity } from "../../../../TestUtils"
import { MailTypeRef } from "../../../../../../src/common/api/entities/tutanota/TypeRefs"
import { Sequential } from "@tensorflow/tfjs-layers"
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
const { anything } = matchers
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
export const DATASET_FILE_PATH: string = "./tests/api/worker/utils/spamClassification/spam_classification_test_mails.csv"
export async function readMailDataFromCSV(filePath: string): Promise<{
spamData: SpamTrainMailDatum[]
hamData: SpamTrainMailDatum[]
}> {
const file = await fs.promises.readFile(filePath)
const csv = parseCsv(file.toString())
let spamData: SpamTrainMailDatum[] = []
let hamData: SpamTrainMailDatum[] = []
for (const row of csv.rows.slice(1, csv.rows.length - 1)) {
const subject = row[8]
const body = row[10]
const label = row[11]
const from = row[0]
const to = row[1]
const cc = row[2]
const bcc = row[3]
const authStatus = row[4]
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
let isSpam = label === "spam" ? true : label === "ham" ? false : null
isSpam = assertNotNull(isSpam, "Unknown label detected: " + label)
const targetData = isSpam ? spamData : hamData
targetData.push({
mailId: ["mailListId", "mailElementId"],
subject,
body,
isSpam,
isSpamConfidence: 1,
ownerGroup: "owner",
sender: from,
toRecipients: to,
ccRecipients: cc,
bccRecipients: bcc,
authStatus: authStatus,
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
} as SpamTrainMailDatum)
}
return { spamData, hamData }
}
// Initial training (cutoff by day or amount)
o.spec("SpamClassifierTest", () => {
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
const mockOfflineStorageCache = object<CacheStorage>()
const mockOfflineStorage = object<OfflineStoragePersistence>()
const mockSpamClassificationInitializer = object<SpamClassificationInitializer>()
let nonEfficientSmallVectorizer: HashingVectorizer
let spamClassifier: SpamClassifier
let spamData: SpamTrainMailDatum[]
let hamData: SpamTrainMailDatum[]
let dataSlice: SpamTrainMailDatum[]
o.beforeEach(async () => {
const spamHamData = await readMailDataFromCSV(DATASET_FILE_PATH)
spamData = spamHamData.spamData
hamData = spamHamData.hamData
dataSlice = spamData.concat(hamData)
seededShuffle(dataSlice, 42)
mockSpamClassificationInitializer.init = async () => {
return dataSlice
}
nonEfficientSmallVectorizer = new HashingVectorizer(512)
spamClassifier = new SpamClassifier(
mockOfflineStorage,
mockOfflineStorageCache,
mockSpamClassificationInitializer,
true,
DEFAULT_PREPROCESS_CONFIGURATION,
nonEfficientSmallVectorizer,
)
})
o("processSpam maintains server classification when client classification is not enabled", async function () {
const mail = createTestEntity(MailTypeRef, {
_id: ["mailListId", "mailId"],
sets: [["folderList", "serverFolder"]],
})
const spamTrainMailDatum: SpamTrainMailDatum = {
mailId: mail._id,
subject: mail.subject,
body: "some body",
isSpam: true,
isSpamConfidence: 1,
ownerGroup: "owner",
sender: "",
toRecipients: "",
ccRecipients: "",
bccRecipients: "",
authStatus: "",
}
const layersModel = object<Sequential>()
spamClassifier.addSpamClassifierForOwner(spamTrainMailDatum.ownerGroup, layersModel, false)
const predictedSpam = await spamClassifier.predict(spamTrainMailDatum)
o(predictedSpam).equals(null)
})
o("processSpam uses client classification when enabled", async function () {
const mail = createTestEntity(MailTypeRef, {
_id: ["mailListId", "mailId"],
sets: [["folderList", "serverFolder"]],
})
const spamTrainMailDatum: SpamTrainMailDatum = {
mailId: mail._id,
subject: mail.subject,
body: "some body",
isSpam: false,
isSpamConfidence: 0,
ownerGroup: "owner",
sender: "",
toRecipients: "",
ccRecipients: "",
bccRecipients: "",
authStatus: "",
}
const layersModel = object<Sequential>()
when(layersModel.predict(anything())).thenReturn(tensor1d([1]))
spamClassifier.addSpamClassifierForOwner(spamTrainMailDatum.ownerGroup, layersModel, true)
const predictedSpam = await spamClassifier.predict(spamTrainMailDatum)
o(predictedSpam).equals(true)
})
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
o("Initial training only", async () => {
o.timeout(20_000)
const trainTestSplit = dataSlice.length * 0.8
const trainSet = dataSlice.slice(0, trainTestSplit)
const testSet = dataSlice.slice(trainTestSplit)
await spamClassifier.initialTraining(trainSet)
await testClassifier(spamClassifier, testSet)
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
})
o("Initial training and refitting in multi step", async () => {
o.timeout(20_000)
const trainTestSplit = dataSlice.length * 0.8
const trainSet = dataSlice.slice(0, trainTestSplit)
const testSet = dataSlice.slice(trainTestSplit)
const trainSetFirstHalf = trainSet.slice(0, trainSet.length / 2)
const trainSetSecondHalf = trainSet.slice(trainSet.length / 2, trainSet.length)
dataSlice = trainSetFirstHalf
o(await mockSpamClassificationInitializer.init("owner")).deepEquals(trainSetFirstHalf)
await spamClassifier.initialTraining(dataSlice)
console.log(`==> Result when testing with mails in two steps (first step).`)
await testClassifier(spamClassifier, testSet)
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
await spamClassifier.updateModel("owner", trainSetSecondHalf)
console.log(`==> Result when testing with mails in two steps (second step).`)
await testClassifier(spamClassifier, testSet)
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
})
o("preprocessMail outputs expected tokens for mail content", async () => {
const classifier = new SpamClassifier(object(), object(), object())
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
const mail = {
subject: `Sample Tokens and values`,
sender: "sender",
toRecipients: "toRecipients",
ccRecipients: "ccRecipients",
bccRecipients: "bccRecipients",
authStatus: "authStatus",
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
// prettier-ignore
body: `Hello, these are my MAC Address
FB-94-77-45-96-74
91-58-81-D5-55-7C
B4-09-49-2A-DE-D4
along with my ISBNs
718385414-0
733065633-X
632756390-2
SSN
227-78-2283
134-34-1253
591-61-6459
SHAs
585eab9b3a5e4430e08f5096d636d0d475a8c69dae21a61c6f1b26c4bd8dd8c1
7233d153f2e0725d3d212d1f27f30258fafd72b286d07b3b1d94e7e3c35dce67
769f65bf44557df44fc5f99c014cbe98894107c9d7be0801f37c55b3776c3990
Phone Numbers
(341) 2027690
+385 958 638 7625
430-284-9438
VIN (Vehicle identification number)
3FADP4AJ3BM438397
WAULT64B82N564937
GUIDs
781a9631-0716-4f9c-bb36-25c3364b754b
325783d4-a64e-453b-85e6-ed4b2cd4c9bf
Hex Colors
#2016c1
#c090a4
#c855f5
#000000
IPV4
91.17.182.120
47.232.175.0
171.90.3.93
On Date:
01-12-2023
1-12-2023
Not Date
2023/12-1
URL
https://tuta.com
https://subdomain.microsoft.com/outlook/test
NOT URL
https://tuta/com
MAIL
test@example.com
plus+addressing@example.com
Credit Card
5002355116026522
4041 3751 9030 3866
Not Credit Card
1234 1234
Bit Coin Address
159S1vV25PAxMiCVaErjPznbWB8YBvANAi
1NJmLtKTyHyqdKo6epyF9ecMyuH1xFWjEt
Not BTC
5213nYwhhGw2qpNijzfnKcbCG4z3hnrVA
1OUm2eZK2ETeAo8v95WhZioQDy32YSerkD
Special Characters
!
@
Not Special Character
§
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
Number Sequences:
26098375
IBAN: DE91 1002 0370 0320 2239 82
Not Number Sequences
SHLT116
gb_67ca4b
Other values found in mails
5.090 € 37 m 1 Zi 100%
Fax (089) 13 33 87 88
August 12, 2025
5:20 PM - 5:25 PM
<this gets removed by HTML as it should use &lt; to represent the character>
and all text on other lines it seems.
<div>
<a rel="noopener noreferrer" target="_blank" href="https://www.somewebsite.de/?key=c2f395513421312029680" style="background-color:#055063;border-radius:3px;color:#ffffff;display:inline-block;font-size: 14px; font-family: sans-serif;font-weight:bold;line-height:36px;height:36px;text-align:center;text-decoration:none;width:157px;-webkit-text-size-adjust:none; margin-bottom:20px">Button Text</a>
</div>
<table cellpadding="0" cellspacing="0" border="0" role="presentation" width="100%"><tbody><tr><td align="center"><a href="https://mail.abc-web.de/optiext/optiextension.dll?ID=someid" rel="noopener noreferrer" target="_blank" style="text-decoration:none"><img id="OWATemporaryImageDivContainer1" src="https://mail.some-domain.de/images/SMC/grafik/image.png" alt="" border="0" class="" width="100%" style="max-width:100%;display:block;width:100%"></a></td></tr></tbody></table>
this text is shown
`,
} as SpamTrainMailDatum
const preprocessedMail = classifier.preprocessMail(mail)
// prettier-ignore
const expectedOutput = `Sample Tokens and values
Hello TSPECIALCHAR these are my MAC Address
\t\t\t\tFB TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR D5 TSPECIALCHAR TNUMBER TSPECIALCHAR 7C
\t\t\t\tB4 TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR 2A TSPECIALCHAR DE TSPECIALCHAR D4
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\talong with my ISBNs
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR X
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tSSN
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tSHAs
\t\t\t\t585eab9b3a5e4430e08f5096d636d0d475a8c69dae21a61c6f1b26c4bd8dd8c1
\t\t\t\t7233d153f2e0725d3d212d1f27f30258fafd72b286d07b3b1d94e7e3c35dce67
\t\t\t\t769f65bf44557df44fc5f99c014cbe98894107c9d7be0801f37c55b3776c3990
\t\t\t\tPhone Numbers
\t\t\t\t TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TSPECIALCHAR TNUMBER TNUMBER TNUMBER TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\tVIN TSPECIALCHAR Vehicle identification number TSPECIALCHAR
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\t3FADP4AJ3BM438397
\t\t\t\tWAULT64B82N564937
\t\t\t\tGUIDs
\t\t\t\t781a9631 TSPECIALCHAR TNUMBER TSPECIALCHAR 4f9c TSPECIALCHAR bb36 TSPECIALCHAR 25c3364b754b
\t\t\t\t325783d4 TSPECIALCHAR a64e TSPECIALCHAR 453b TSPECIALCHAR 85e6 TSPECIALCHAR ed4b2cd4c9bf
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tHex Colors
\t\t\t\t TSPECIALCHAR 2016c1
\t\t\t\t TSPECIALCHAR c090a4
\t\t\t\t TSPECIALCHAR c855f5
\t\t\t\t TSPECIALCHAR TNUMBER
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tIPV4
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\tOn Date TSPECIALCHAR
\t\t\t\t TDATE
\t\t\t\t TDATE
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tNot Date
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tURL
\t\t\t\t TURLtuta TSPECIALCHAR com
\t\t\t\t TURLsubdomain TSPECIALCHAR microsoft TSPECIALCHAR com
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tNOT URL
\t\t\t\t TURLtuta
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tMAIL
\t\t\t\t TEMAIL
\t\t\t\t TEMAIL
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tCredit Card
\t\t\t\t TCREDITCARD
\t\t\t\t TCREDITCARD
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tNot Credit Card
\t\t\t\t TNUMBER TNUMBER
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tBit Coin Address
\t\t\t\t TBITCOIN
\t\t\t\t TBITCOIN
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tNot BTC
\t\t\t\t5213nYwhhGw2qpNijzfnKcbCG4z3hnrVA
\t\t\t\t1OUm2eZK2ETeAo8v95WhZioQDy32YSerkD
\t\t\t\tSpecial Characters
\t\t\t\t TSPECIALCHAR
\t\t\t\t TSPECIALCHAR
\t\t\t\tNot Special Character
\t\t\t\t§
\t\t\t\tNumber Sequences TSPECIALCHAR
\t\t\t\t TNUMBER
\t\t\t\tIBAN TSPECIALCHAR DE91 TCREDITCARD TNUMBER
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tNot Number Sequences
\t\t\t\tSHLT116
\t\t\t\tgb TSPECIALCHAR 67ca4b
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
\t\t\t\tOther values found in mails
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER € TNUMBER m TNUMBER Zi TNUMBER TSPECIALCHAR
\t\t\t\tFax TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TNUMBER TNUMBER TNUMBER
\t\t\t\tAugust TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER PM TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER PM
\t\t\t\tand all text on other lines it seems TSPECIALCHAR
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
Button Text
this text is shown
sender
toRecipients
ccRecipients
bccRecipients
authStatus`
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
o.check(preprocessedMail).equals(expectedOutput)
})
o("predict uses different models for different owner groups", async () => {
const firstGroupModel = object<LayersModel>()
const secondGroupModel = object<LayersModel>()
mockAttribute(spamClassifier, spamClassifier.loadModel, (ownerGroup) => {
if (ownerGroup === "firstGroup") {
return Promise.resolve(firstGroupModel)
} else if (ownerGroup === "secondGroup") {
return Promise.resolve(secondGroupModel)
}
return null
})
mockAttribute(spamClassifier, spamClassifier.updateAndSaveModel, () => {
return Promise.resolve()
})
const firstGroupReturnTensor = tensor1d([1.0], undefined)
when(firstGroupModel.predict(matchers.anything())).thenReturn(firstGroupReturnTensor)
const secondGroupReturnTensor = tensor1d([0.0], undefined)
when(secondGroupModel.predict(matchers.anything())).thenReturn(secondGroupReturnTensor)
await spamClassifier.initialize("firstGroup")
await spamClassifier.initialize("secondGroup")
const commonSpamFields = {
subject: "",
body: "",
sender: "string",
toRecipients: "string",
ccRecipients: "string",
bccRecipients: "string",
authStatus: "",
}
const isSpamFirstMail = await spamClassifier.predict({
ownerGroup: "firstGroup",
...commonSpamFields,
})
const isSpamSecondMail = await spamClassifier.predict({
ownerGroup: "secondGroup",
...commonSpamFields,
})
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
o(isSpamFirstMail).equals(true)
o(isSpamSecondMail).equals(false)
// manually dispose @tensorflow tensors to save memory
firstGroupReturnTensor.dispose()
secondGroupReturnTensor.dispose()
})
})
// These are rather analysis instead of test
// They run in loop hence do take more time to finish and is not necessary to include in CI test suite
//
// To enable running this, change following constant to true
const DO_RUN_PERFORMANCE_ANALYSIS = false
if (DO_RUN_PERFORMANCE_ANALYSIS) {
o.spec("SpamClassifier - Performance Analysis", () => {
const mockOfflineStorageCache = object<CacheStorage>()
const mockOfflineStorage = object<OfflineStoragePersistence>()
let spamClassifier = object<SpamClassifier>()
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
let dataSlice: SpamTrainMailDatum[]
o.beforeEach(() => {
const mockSpamClassificationInitializer = object<SpamClassificationInitializer>()
mockSpamClassificationInitializer.init = async () => {
return dataSlice
}
spamClassifier = new SpamClassifier(mockOfflineStorage, mockOfflineStorageCache, mockSpamClassificationInitializer)
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
})
o("time to refit", async () => {
o.timeout(20_000_000)
const { spamData, hamData } = await readMailDataFromCSV(DATASET_FILE_PATH)
const hamSlice = hamData.slice(0, 1000)
const spamSlice = spamData.slice(0, 400)
dataSlice = hamSlice.concat(spamSlice)
seededShuffle(dataSlice, 42)
const start = performance.now()
await spamClassifier.initialTraining(dataSlice)
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
const initialTrainingDuration = performance.now() - start
console.log(`initial training time ${initialTrainingDuration}ms`)
for (let i = 0; i < 20; i++) {
const nowSpam = [hamSlice[0]]
nowSpam.map((formerHam) => (formerHam.isSpam = true))
const retrainingStart = performance.now()
await spamClassifier.updateModel("owner", nowSpam)
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
const retrainingDuration = performance.now() - retrainingStart
console.log(`retraining time ${retrainingDuration}ms`)
}
})
o("refit after moving a false negative classification multiple times", async () => {
o.timeout(20_000_000)
const { spamData, hamData } = await readMailDataFromCSV(DATASET_FILE_PATH)
const hamSlice = hamData.slice(0, 100)
const spamSlice = spamData.slice(0, 10)
dataSlice = hamSlice.concat(spamSlice)
// seededShuffle(dataSlice, 42)
await spamClassifier.initialTraining(dataSlice)
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
const falseNegatives = spamData
.slice(10)
.filter(async (mailDatum) => mailDatum.isSpam !== (await spamClassifier.predict(mailDatum)))
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
.sort()
.slice(0, 10)
let retrainingNeeded = new Array<number>(falseNegatives.length).fill(0)
for (let i = 0; i < falseNegatives.length; i++) {
const sample = falseNegatives[i]
const copiedClassifier = await spamClassifier.cloneClassifier()
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
let retrainCount = 0
let predictedSpam = false
while (!predictedSpam && retrainCount++ <= 3) {
await copiedClassifier.updateModel("owner", [{ ...sample, isSpam: true, isSpamConfidence: 1 }])
predictedSpam = assertNotNull(await copiedClassifier.predict(sample))
}
retrainingNeeded[i] = retrainCount
}
console.log(retrainingNeeded)
const maxRetrain = Math.max(...retrainingNeeded)
o.check(retrainingNeeded.length >= 10).equals(true)
o.check(maxRetrain < 3).equals(true)
})
o("refit after moving a false positive classification multiple times", async () => {
o.timeout(20_000_000)
const { spamData, hamData } = await readMailDataFromCSV(DATASET_FILE_PATH)
const hamSlice = hamData.slice(0, 10)
const spamSlice = spamData.slice(0, 100)
dataSlice = hamSlice.concat(spamSlice)
// seededShuffle(dataSlice, 42)
await spamClassifier.initialTraining(dataSlice)
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
const falsePositive = hamData
.slice(10)
.filter(async (mailDatum) => mailDatum.isSpam !== (await spamClassifier.predict(mailDatum)))
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
.slice(0, 10)
let retrainingNeeded = new Array<number>(falsePositive.length).fill(0)
for (let i = 0; i < falsePositive.length; i++) {
const sample = falsePositive[i]
const copiedClassifier = await spamClassifier.cloneClassifier()
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
let retrainCount = 0
let predictedSpam = false
while (!predictedSpam && retrainCount++ <= 10) {
await copiedClassifier.updateModel("owner", [{ ...sample, isSpam: true }])
await copiedClassifier.updateModel("owner", [{ ...sample, isSpam: false }])
predictedSpam = assertNotNull(await copiedClassifier.predict(sample))
}
retrainingNeeded[i] = retrainCount
}
console.log(retrainingNeeded)
const maxRetrain = Math.max(...retrainingNeeded)
o.check(retrainingNeeded.length >= 10).equals(true)
o.check(maxRetrain < 3).equals(true)
})
o("retrain after moving a false negative classification multiple times", async () => {
o.timeout(20_000_000)
const { spamData, hamData } = await readMailDataFromCSV(DATASET_FILE_PATH)
const hamSlice = hamData.slice(0, 100)
const spamSlice = spamData.slice(0, 10)
dataSlice = hamSlice.concat(spamSlice)
seededShuffle(dataSlice, 42)
await spamClassifier.initialTraining(dataSlice)
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
const falseNegatives = spamData
.slice(10)
.filter(async (mailDatum) => mailDatum.isSpam !== (await spamClassifier.predict(mailDatum)))
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
.slice(0, 10)
let retrainingNeeded = new Array<number>(falseNegatives.length).fill(0)
for (let i = 0; i < falseNegatives.length; i++) {
const sample = falseNegatives[i]
const copiedClassifier = await spamClassifier.cloneClassifier()
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
let retrainCount = 0
let predictedSpam = false
while (!predictedSpam && retrainCount++ <= 10) {
await copiedClassifier.initialTraining([...dataSlice, sample])
predictedSpam = assertNotNull(await copiedClassifier.predict(sample))
}
retrainingNeeded[i] = retrainCount
}
console.log(retrainingNeeded)
const maxRetrain = Math.max(...retrainingNeeded)
o.check(retrainingNeeded.length >= 10).equals(true)
o.check(maxRetrain < 3).equals(true)
})
o("Time spent in vectorization during initial training", async () => {
o.timeout(2_000_000)
const ITERATION_COUNT: number = 1
const { spamData, hamData } = await readMailDataFromCSV(DATASET_FILE_PATH)
dataSlice = spamData.concat(hamData)
let trainingTimes = new Array<number>()
let vectorizationTimes = new Array<number>()
let trainingWithoutVectorization = new Array<number>()
await promiseMap(
new Array<number>(ITERATION_COUNT).fill(0),
async () => {
const { vectorizationTime, trainingTime } = await spamClassifier.initialTraining(dataSlice)
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
trainingTimes.push(trainingTime)
vectorizationTimes.push(vectorizationTime)
trainingWithoutVectorization.push(trainingTime - vectorizationTime)
},
{ concurrency: ITERATION_COUNT },
)
trainingTimes = trainingTimes.sort()
vectorizationTimes = vectorizationTimes.sort()
trainingWithoutVectorization = trainingWithoutVectorization.sort()
const avgTrainingTime = trainingTimes.reduce((a, b) => a + b, 0) / trainingTimes.length
const avgVectorizationTime = vectorizationTimes.reduce((a, b) => a + b, 0) / vectorizationTimes.length
const avgTrainingWithoutVectorization = trainingWithoutVectorization.reduce((a, b) => a + b, 0) / trainingWithoutVectorization.length
console.log("For vectorization:")
console.log({ min: vectorizationTimes.at(0), max: vectorizationTimes.at(-1), avg: avgVectorizationTime })
console.log("For whole training:")
console.log({ min: trainingTimes.at(0), max: trainingTimes.at(-1), avg: avgTrainingTime })
console.log("For training without vectorization:")
console.log({
min: trainingWithoutVectorization.at(0),
max: trainingWithoutVectorization.at(-1),
avg: avgTrainingWithoutVectorization,
})
})
})
}
async function testClassifier(classifier: SpamClassifier, mails: SpamTrainMailDatum[]): Promise<void> {
let predictionArray: number[] = []
for (let mail of mails) {
const prediction = await classifier.predict(mail)
predictionArray.push(prediction ? 1 : 0)
}
const ysArray = mails.map((mail) => mail.isSpam)
let tp = 0,
tn = 0,
fp = 0,
fn = 0
for (let i = 0; i < predictionArray.length; i++) {
const predictedSpam = predictionArray[i] > 0.5
const isActuallyASpam = ysArray[i]
if (predictedSpam && isActuallyASpam) tp++
else if (!predictedSpam && !isActuallyASpam) tn++
else if (predictedSpam && !isActuallyASpam) fp++
else if (!predictedSpam && isActuallyASpam) fn++
}
const total = tp + tn + fp + fn
const accuracy = (tp + tn) / total
const precision = tp / (tp + fp + 1e-7)
const recall = tp / (tp + fn + 1e-7)
const f1 = 2 * ((precision * recall) / (precision + recall + 1e-7))
console.log("\n--- Evaluation Metrics ---")
console.log(`Accuracy: \t${(accuracy * 100).toFixed(2)}%`)
console.log(`Precision:\t${(precision * 100).toFixed(2)}%`)
console.log(`Recall: \t${(recall * 100).toFixed(2)}%`)
console.log(`F1 Score: \t${(f1 * 100).toFixed(2)}%`)
console.log("\nConfusion Matrix:")
console.log({
Predicted_Spam: { True_Positive: tp, False_Positive: fp },
Predicted_Ham: { False_Negative: fn, True_Negative: tn },
})
}
[antispam] Add client-side local spam filtering Implement a local machine learning model for client-side spam filtering. The local model is implemented using tensorflow "LayersModel" to train separate models in all available mailboxes, resulting in one model per ownerGroup (i.e. mailbox). Initially, the training data is aggregated from the last 30 days of received mails, and the data is stored in a separate offline database table named spam_classification_training_data. The trained model is stored in the table spam_classification_model. The initial training starts after indexing, with periodic training happening every 30 minutes and on each subsequent login. The model will predict on incoming mails once we have received the entity event for said mail, moving it to either inbox or spam folder. When users move mails, we update the training data labels accordingly, by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now contains a moveReason, which indicates that the mail has been moved by our spam filter. Client-side spam filtering can be activated using the SpamClientClassification feature flag, and is for now only available on the desktop client. Co-authored-by: sug <sug@tutao.de> Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com> Co-authored-by: abp <abp@tutao.de> Co-authored-by: map <mpfau@users.noreply.github.com> Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com> Co-authored-by: frm <frm@tutao.de> Co-authored-by: das <das@tutao.de> Co-authored-by: nif <nif@tutao.de> Co-authored-by: amm <amm@tutao.de>
2025-10-14 12:32:17 +02:00
// For testing, we need deterministic shuffling which is not provided by tf.util.shuffle(dataSlice)
// Seeded Fisher-Yates shuffle
function seededShuffle<T>(array: T[], seed: number): void {
const random = seededRandom(seed)
for (let i = array.length - 1; i > 0; i--) {
const j = Math.floor(random() * (i + 1))
;[array[i], array[j]] = [array[j], array[i]]
}
}
function seededRandom(seed: number): () => number {
const m = 0x80000000 // 2^31
const a = 1103515245
const c = 12345
let state = seed
return function (): number {
state = (a * state + c) % m
return state / m
}
}