Include header fields as tokens in the anti-spam vectors

Add the header fields (sender, toRecipients, ccRecipients, bccRecipients, authStatus) to the anti-spam vectors. We also improve some of the preprocessing steps and add an offline migration that drops the old spam tables.

Co-authored-by: amm@tutao.de
Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com>
2025-12-07 13:49:47 +00:00 · 2025-10-22 16:18:24 +02:00 · 2025-10-22 16:18:24 +02:00 · f8bbd32695
commit f8bbd32695
parent 21ad4ce2c3
13 changed files with 10918 additions and 10788 deletions
--- a/buildSrc/RollupConfig.js
+++ b/buildSrc/RollupConfig.js
@ -46,7 +46,7 @@ export const allowedImports = {
 	contacts: ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main", "mail-view", "date", "date-gui", "mail-editor"],
 	"calendar-view": ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main", "date", "date-gui", "sharing", "contacts"],
 	login: ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main"],
-	"spam-classifier": ["polyfill-helpers", "common", "common-min"],
+	"spam-classifier": ["polyfill-helpers", "common", "common-min", "main"],
 	worker: ["polyfill-helpers", "common-min", "common", "native-common", "native-worker", "wasm", "wasm-fallback"],
 	"pow-worker": [],
 	settings: [
--- a/src/common/api/worker/offline/OfflineStorage.ts
+++ b/src/common/api/worker/offline/OfflineStorage.ts
@ -46,9 +46,6 @@ import { AttributeModel } from "../../common/AttributeModel"
 import { TypeModelResolver } from "../../common/EntityFunctions"
 import { collapseId, expandId } from "../rest/RestClientIdUtils"
 import { Category, syncMetrics } from "../utils/SyncMetrics"
-import { hasError } from "../../common/utils/ErrorUtils"
-import { SpamClassificationModel, SpamTrainMailDatum } from "../../../../mail-app/workerUtils/spamClassification/SpamClassifier"
-import { Mail } from "../../entities/tutanota/TypeRefs"

 /**
 * this is the value of SQLITE_MAX_VARIABLE_NUMBER in sqlite3.c
--- a/src/common/api/worker/offline/OfflineStorageMigrator.ts
+++ b/src/common/api/worker/offline/OfflineStorageMigrator.ts
@ -7,6 +7,7 @@ import { offline6 } from "./migrations/offline-v6"
 import { offline7 } from "./migrations/offline-v7"
 import { offline8 } from "./migrations/offline-v8"
 import { ProgrammingError } from "../../common/error/ProgrammingError"
+import { offline9 } from "./migrations/offline-v9"

 export interface OfflineMigration {
 	readonly version: number
@ -20,11 +21,11 @@ export interface OfflineMigration {
 * Normally you should only add them to the end of the list but with offline ones it can be a bit tricky since they change the db structure itself so sometimes
 * they should rather be in the beginning.
 */
-export const OFFLINE_STORAGE_MIGRATIONS: ReadonlyArray<OfflineMigration> = [offline5, offline6, offline7, offline8]
+export const OFFLINE_STORAGE_MIGRATIONS: ReadonlyArray<OfflineMigration> = [offline5, offline6, offline7, offline8, offline9]

 // in cases where the actual migration is not there anymore (we clean up old migrations no client would apply anymore)
 // and we create a new offline database, we still need to set the offline version to the current value.
-export const CURRENT_OFFLINE_VERSION = 8
+export const CURRENT_OFFLINE_VERSION = 9

 /**
 * Migrator for the offline storage between different versions of model. It is tightly couples to the versions of API entities: every time we make an
--- a/src/common/api/worker/offline/migrations/offline-v9.ts
+++ b/src/common/api/worker/offline/migrations/offline-v9.ts
@ -0,0 +1,12 @@
+import { OfflineMigration } from "../OfflineStorageMigrator.js"
+import { OfflineStorage } from "../OfflineStorage.js"
+import { SqlCipherFacade } from "../../../../native/common/generatedipc/SqlCipherFacade"
+
+export const offline9: OfflineMigration = {
+	version: 9,
+	async migrate(storage: OfflineStorage, sqlCipherFacade: SqlCipherFacade) {
+		console.log("dropping spam_classification_training_data and spam_classification_model, due to new fields")
+		await sqlCipherFacade.run(`DROP TABLE IF EXISTS spam_classification_training_data`, [])
+		await sqlCipherFacade.run(`DROP TABLE IF EXISTS spam_classification_model`, [])
+	},
+}
--- a/src/mail-app/mail/model/SpamClassificationHandler.ts
+++ b/src/mail-app/mail/model/SpamClassificationHandler.ts
@ -1,13 +1,14 @@
-import { createMoveMailData, Mail, MailDetails, MailFolder, MoveMailData } from "../../../common/api/entities/tutanota/TypeRefs"
+import { createMoveMailData, Mail, MailAddress, MailDetails, MailFolder, MoveMailData } from "../../../common/api/entities/tutanota/TypeRefs"
 import {
 	DEFAULT_IS_SPAM,
 	DEFAULT_IS_SPAM_CONFIDENCE,
 	getSpamConfidence,
+	MailAuthenticationStatus,
 	MailSetKind,
 	ProcessingState,
 	SpamDecision,
 } from "../../../common/api/common/TutanotaConstants"
-import type { SpamClassifier, SpamPredMailDatum, SpamTrainMailDatum } from "../../workerUtils/spamClassification/SpamClassifier"
+import { SpamClassifier, SpamPredMailDatum, SpamTrainMailDatum } from "../../workerUtils/spamClassification/SpamClassifier"
 import { getMailBodyText } from "../../../common/api/common/CommonMailUtils"
 import { assertNotNull, debounce, isNotNull, Nullable, ofClass } from "@tutao/tutanota-utils"
 import { MailFacade } from "../../../common/api/worker/facades/lazy/MailFacade"
@ -75,6 +76,7 @@ export class SpamClassificationHandler {
 			subject: mail.subject,
 			body: getMailBodyText(mailDetails.body),
 			ownerGroup: assertNotNull(mail._ownerGroup),
+			...extractSpamHeaderFeatures(mail, mailDetails),
 		}
 		const isSpam = (await this.spamClassifier?.predict(spamPredMailDatum)) ?? null

@ -141,7 +143,44 @@ export class SpamClassificationHandler {
 			isSpam: DEFAULT_IS_SPAM,
 			isSpamConfidence: DEFAULT_IS_SPAM_CONFIDENCE,
 			ownerGroup: assertNotNull(mail._ownerGroup),
+			...extractSpamHeaderFeatures(mail, mailDetails),
 		}
 		await this.spamClassifier?.storeSpamClassification(spamTrainMailDatum)
 	}
 }
+
+export function extractSpamHeaderFeatures(mail: Mail, mailDetails: MailDetails) {
+	const sender = joinNamesAndMailAddresses([mail?.sender])
+	const { toRecipients, ccRecipients, bccRecipients } = extractRecipients(mailDetails)
+	const authStatus = convertAuthStatusToSpamCategorizationToken(mail.authStatus)
+
+	return { sender, toRecipients, ccRecipients, bccRecipients, authStatus }
+}
+
+function extractRecipients({ recipients }: MailDetails) {
+	const toRecipients = joinNamesAndMailAddresses(recipients?.toRecipients)
+	const ccRecipients = joinNamesAndMailAddresses(recipients?.ccRecipients)
+	const bccRecipients = joinNamesAndMailAddresses(recipients?.bccRecipients)
+
+	return { toRecipients, ccRecipients, bccRecipients }
+}
+
+function joinNamesAndMailAddresses(recipients: MailAddress[] | null) {
+	return recipients?.map((recipient) => `${recipient?.name} ${recipient?.address}`).join(" ") || ""
+}
+
+function convertAuthStatusToSpamCategorizationToken(authStatus: string | null): string {
+	if (authStatus === MailAuthenticationStatus.AUTHENTICATED) {
+		return "TAUTHENTICATED"
+	} else if (authStatus === MailAuthenticationStatus.HARD_FAIL) {
+		return "THARDFAIL"
+	} else if (authStatus === MailAuthenticationStatus.SOFT_FAIL) {
+		return "TSOFTFAIL"
+	} else if (authStatus === MailAuthenticationStatus.INVALID_MAIL_FROM) {
+		return "TINVALIDMAILFROM"
+	} else if (authStatus === MailAuthenticationStatus.MISSING_MAIL_FROM) {
+		return "TMISSINGMAILFROM"
+	}
+
+	return ""
+}
--- a/src/mail-app/workerUtils/index/OfflineStoragePersistence.ts
+++ b/src/mail-app/workerUtils/index/OfflineStoragePersistence.ts
@ -67,12 +67,14 @@ export const SearchTableDefinitions: Record<string, OfflineStorageTable> = Objec
 })

 export const SpamClassificationDefinitions: Record<string, OfflineStorageTable> = Object.freeze({
-	// Spam classification training data
 	spam_classification_training_data: {
 		definition:
 			"CREATE TABLE IF NOT EXISTS spam_classification_training_data (listId TEXT NOT NULL, elementId TEXT NOT NULL," +
-			" ownerGroup TEXT NOT NULL, subject TEXT NOT NULL, body TEXT NOT NULL, isSpam NUMBER, " +
-			"lastModified NUMBER NOT NULL, isSpamConfidence NUMBER NOT NULL, PRIMARY KEY (listId, elementId))",
+			"ownerGroup TEXT NOT NULL, subject TEXT NOT NULL, body TEXT NOT NULL, isSpam NUMBER," +
+			"lastModified NUMBER NOT NULL, isSpamConfidence NUMBER NOT NULL, sender TEXT NOT NULL," +
+			"toRecipients TEXT NOT NULL, ccRecipients TEXT NOT NULL, bccRecipients TEXT NOT NULL," +
+			"authStatus TEXT NOT NULL, PRIMARY KEY (listId, elementId))",
+
 		purgedWithCache: true,
 	},

@ -187,18 +189,24 @@ export class OfflineStoragePersistence {

 	async storeSpamClassification(spamTrainMailDatum: SpamTrainMailDatum): Promise<void> {
 		const { query, params } = sql`
-            INSERT
-            OR REPLACE INTO spam_classification_training_data(listId, elementId, ownerGroup, subject, body, isSpam, lastModified, isSpamConfidence)
+			INSERT
+			OR REPLACE INTO spam_classification_training_data(listId, elementId, ownerGroup, subject, body, isSpam, 
+            lastModified, isSpamConfidence, sender, toRecipients, ccRecipients, bccRecipients, authStatus)
 				VALUES (
-            ${listIdPart(spamTrainMailDatum.mailId)},
-            ${elementIdPart(spamTrainMailDatum.mailId)},
-            ${spamTrainMailDatum.ownerGroup},
-            ${spamTrainMailDatum.subject},
-            ${spamTrainMailDatum.body},
-            ${spamTrainMailDatum.isSpam ? 1 : 0},
-            ${Date.now()},
-            ${spamTrainMailDatum.isSpamConfidence}
-            )`
+			${listIdPart(spamTrainMailDatum.mailId)},
+			${elementIdPart(spamTrainMailDatum.mailId)},
+			${spamTrainMailDatum.ownerGroup},
+			${spamTrainMailDatum.subject},
+			${spamTrainMailDatum.body},
+			${spamTrainMailDatum.isSpam ? 1 : 0},
+			${Date.now()},
+			${spamTrainMailDatum.isSpamConfidence},
+			${spamTrainMailDatum.sender},
+			${spamTrainMailDatum.toRecipients},
+			${spamTrainMailDatum.ccRecipients},
+			${spamTrainMailDatum.bccRecipients},
+			${spamTrainMailDatum.authStatus}
+			)`
 		await this.sqlCipherFacade.run(query, params)
 	}

@ -250,11 +258,21 @@ export class OfflineStoragePersistence {
 	}

 	async getCertainSpamClassificationTrainingDataAfterCutoff(cutoffTimestamp: number, ownerGroupId: Id): Promise<SpamTrainMailDatum[]> {
-		const { query, params } = sql`SELECT listId, elementId, subject, body, isSpam, isSpamConfidence
-                                    FROM spam_classification_training_data
-                                    WHERE lastModified > ${cutoffTimestamp}
-                                      AND isSpamConfidence > 0
-                                      AND ownerGroup = ${ownerGroupId}`
+		const { query, params } = sql`SELECT listId,
+											 elementId,
+											 subject,
+											 body,
+											 isSpam,
+											 isSpamConfidence,
+											 sender,
+											 toRecipients,
+											 ccRecipients,
+											 bccRecipients,
+											 authStatus
+									  FROM spam_classification_training_data
+									  WHERE lastModified > ${cutoffTimestamp}
+										AND isSpamConfidence > 0
+										AND ownerGroup = ${ownerGroupId}`
 		const resultRows = await this.sqlCipherFacade.all(query, params)
 		return resultRows.map(untagSqlObject).map((row) => row as unknown as SpamTrainMailDatum)
 	}
--- a/src/mail-app/workerUtils/spamClassification/PreprocessPatterns.ts
+++ b/src/mail-app/workerUtils/spamClassification/PreprocessPatterns.ts
@ -9,30 +9,30 @@ export const ML_DATE_REGEX = [
 	/\b(?<!-)\d{4}(?:-\d{1,2}){2}(?!-)\b/g, // 2023-12-01 | 2023-12-1
 ]

-export const ML_DATE_TOKEN = " <DATE> "
+export const ML_DATE_TOKEN = " TDATE "

 export const ML_URL_REGEX = /(?:http|https|ftp|sftp):\/\/([\w.-]+)(?:\/[^\s]*)?/g

-export const ML_URL_TOKEN = " <URL-$1> "
+export const ML_URL_TOKEN = " TURL$1 "

 export const ML_EMAIL_ADDR_REGEX = /(?:mailto:)?[A-Za-z0-9_+\-.]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g
-export const ML_EMAIL_ADDR_TOKEN = " <EMAIL> "
+export const ML_EMAIL_ADDR_TOKEN = " TEMAIL "

 export const ML_BITCOIN_REGEX = /\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b/g

-export const ML_BITCOIN_TOKEN = " <BITCOIN> "
+export const ML_BITCOIN_TOKEN = " TBITCOIN "

 export const ML_CREDIT_CARD_REGEX = /\b(\d{4}\s?){4}\b|\b[0-9]\d{13,16}\b/g

-export const ML_CREDIT_CARD_TOKEN = " <CREDIT-CARD> "
+export const ML_CREDIT_CARD_TOKEN = " TCREDITCARD "

 export const ML_NUMBER_SEQUENCE_REGEX = /\b\d+\b/g

-export const ML_NUMBER_SEQUENCE_TOKEN = " <NUMBER> "
+export const ML_NUMBER_SEQUENCE_TOKEN = " TNUMBER "

-export const ML_SPECIAL_CHARACTER_REGEX = /([!@#$%^&*()+`_=\\{}"':;?/,.~]+)(?![^<]*>)|(?!\w)[-]+(?!\w)/g
+export const ML_SPECIAL_CHARACTER_REGEX = /([!@#$%^&*()[\]<>+`_=\\{}"':;?/,-.~]+)/g

-export const ML_SPECIAL_CHARACTER_TOKEN = " <SPECIAL-CHAR> "
+export const ML_SPECIAL_CHARACTER_TOKEN = " TSPECIALCHAR "

 export const ML_SPACE_BEFORE_NEW_LINE_REGEX = /\s+\n/g

--- a/src/mail-app/workerUtils/spamClassification/SpamClassificationInitializer.ts
+++ b/src/mail-app/workerUtils/spamClassification/SpamClassificationInitializer.ts
@ -1,6 +1,16 @@
 import { EntityClient } from "../../../common/api/common/EntityClient"
 import { assertNotNull, isNotNull, lazyAsync } from "@tutao/tutanota-utils"
-import { MailBag, MailboxGroupRootTypeRef, MailBoxTypeRef, MailFolder, MailFolderTypeRef, MailTypeRef } from "../../../common/api/entities/tutanota/TypeRefs"
+import {
+	MailAddress,
+	MailBag,
+	MailboxGroupRootTypeRef,
+	MailBoxTypeRef,
+	MailDetails,
+	MailFolder,
+	MailFolderTypeRef,
+	MailTypeRef,
+	Recipients,
+} from "../../../common/api/entities/tutanota/TypeRefs"
 import { getMailSetKind, getSpamConfidence, MailSetKind } from "../../../common/api/common/TutanotaConstants"
 import { elementIdPart, isSameId, listIdPart, timestampToGeneratedId } from "../../../common/api/common/utils/EntityUtils"
 import { OfflineStoragePersistence } from "../index/OfflineStoragePersistence"
@ -8,6 +18,7 @@ import { getMailBodyText } from "../../../common/api/common/CommonMailUtils"
 import { BulkMailLoader, MailWithMailDetails } from "../index/BulkMailLoader"
 import { hasError } from "../../../common/api/common/utils/ErrorUtils"
 import { SpamTrainMailDatum } from "./SpamClassifier"
+import { extractSpamHeaderFeatures } from "../../mail/model/SpamClassificationHandler"

 const INITIAL_SPAM_CLASSIFICATION_INDEX_INTERVAL_DAYS = 28

@ -30,7 +41,6 @@ export class SpamClassificationInitializer {
 		// available in the current mail bag
 		const data = await this.downloadMailAndMailDetailsByGroupMembership(ownerGroup)
 		data.filter((datum) => datum.isSpamConfidence > 0)
-		data.map((datum) => this.offlineStorage.storeSpamClassification(datum))

 		let spamMailsCount = 0
 		let hamMailsCount = 0
@ -53,7 +63,6 @@ export class SpamClassificationInitializer {
 		const mailbox = await this.entityClient.load(MailBoxTypeRef, mailboxGroupRoot.mailbox)
 		const mailSets = await this.entityClient.loadAll(MailFolderTypeRef, assertNotNull(mailbox.folders).folders)
 		const spamFolder = mailSets.find((s) => getMailSetKind(s) === MailSetKind.SPAM)!
-		const inboxFolder = mailSets.find((s) => getMailSetKind(s) === MailSetKind.INBOX)!

 		const downloadedMailClassificationDatas = new Array<SpamTrainMailDatum>()
 		const allMailbags = [assertNotNull(mailbox.currentMailBag), ...mailbox.archivedMailBags].reverse() // sorted from latest to oldest
@ -63,14 +72,14 @@ export class SpamClassificationInitializer {
 			isNotNull(currentMailbag) && downloadedMailClassificationDatas.length < this.MIN_MAILS_COUNT;
 			currentMailbag = allMailbags.pop()
 		) {
-			const mailsOfThisMailbag = await this.downloadMailAndMailDetailsByMailbag(currentMailbag, spamFolder, inboxFolder)
+			const mailsOfThisMailbag = await this.downloadMailAndMailDetailsByMailbag(currentMailbag, spamFolder)
 			downloadedMailClassificationDatas.push(...mailsOfThisMailbag)
 		}

 		return downloadedMailClassificationDatas
 	}

-	private async downloadMailAndMailDetailsByMailbag(mailbag: MailBag, spamFolder: MailFolder, inboxFolder: MailFolder): Promise<Array<SpamTrainMailDatum>> {
+	private async downloadMailAndMailDetailsByMailbag(mailbag: MailBag, spamFolder: MailFolder): Promise<Array<SpamTrainMailDatum>> {
 		const { LocalTimeDateProvider } = await import("../../../common/api/worker/DateProvider.js")
 		const dateProvider = new LocalTimeDateProvider()
 		const startTime = dateProvider.getStartOfDayShiftedBy(this.TIME_LIMIT).getTime()
@ -84,11 +93,12 @@ export class SpamClassificationInitializer {
 			// Download mail details
 			.then((mails) => bulkMailLoader.loadMailDetails(mails))
 			// Map to spam mail datum
-			.then((mails) => mails.map((m) => this.mailWithDetailsToMailDatum(spamFolder, inboxFolder, m)))
+			.then((mails) => mails.map((m) => this.mailWithDetailsToMailDatum(spamFolder, m)))
 	}

-	private mailWithDetailsToMailDatum(spamFolder: MailFolder, inboxFolder: MailFolder, { mail, mailDetails }: MailWithMailDetails): SpamTrainMailDatum {
+	private mailWithDetailsToMailDatum(spamFolder: MailFolder, { mail, mailDetails }: MailWithMailDetails): SpamTrainMailDatum {
 		const isSpam = mail.sets.some((folderId) => isSameId(folderId, spamFolder._id))
+
 		return {
 			mailId: mail._id,
 			subject: mail.subject,
@ -98,6 +108,7 @@ export class SpamClassificationInitializer {
 			listId: listIdPart(mail._id),
 			elementId: elementIdPart(mail._id),
 			ownerGroup: assertNotNull(mail._ownerGroup),
+			...extractSpamHeaderFeatures(mail, mailDetails),
 		} as SpamTrainMailDatum
 	}
 }
--- a/src/mail-app/workerUtils/spamClassification/SpamClassifier.ts
+++ b/src/mail-app/workerUtils/spamClassification/SpamClassifier.ts
@ -54,12 +54,22 @@ export type SpamTrainMailDatum = {
 	isSpam: boolean
 	isSpamConfidence: number
 	ownerGroup: Id
+	sender: string
+	toRecipients: string
+	ccRecipients: string
+	bccRecipients: string
+	authStatus: string
 }

 export type SpamPredMailDatum = {
 	subject: string
 	body: string
 	ownerGroup: Id
+	sender: string
+	toRecipients: string
+	ccRecipients: string
+	bccRecipients: string
+	authStatus: string
 }

 const PREDICTION_THRESHOLD = 0.55
@ -223,9 +233,16 @@ export class SpamClassifier {
 			preprocessedMail = preprocessedMail.replaceAll(ML_SPACE_BEFORE_NEW_LINE_REGEX, ML_SPACE_BEFORE_NEW_LINE_TOKEN)
 		}

+		preprocessedMail += this.getHeaderFeatures(mail)
+
 		return preprocessedMail
 	}

+	private getHeaderFeatures(mail: SpamTrainMailDatum | SpamPredMailDatum): string {
+		const { sender, toRecipients, ccRecipients, bccRecipients, authStatus } = mail
+		return `\n${sender}\n${toRecipients}\n${ccRecipients}\n${bccRecipients}\n${authStatus}`
+	}
+
 	public async initialTraining(mails: SpamTrainMailDatum[]): Promise<TrainingPerformance> {
 		const preprocessingStart = performance.now()
 		const tokenizedMails = await promiseMap(mails, (mail) => spamClassifierTokenizer(this.preprocessMail(mail)))
@ -497,7 +514,8 @@ export class SpamClassifier {
 	private concatSubjectAndBody(mail: SpamTrainMailDatum | SpamPredMailDatum) {
 		const subject = mail.subject || ""
 		const body = mail.body || ""
-		const concatenated = `${subject} ${body}`.trim()
+		const concatenated = `${subject}\n${body}`.trim()
+
 		return concatenated.length > 0 ? concatenated : " "
 	}

--- a/test/tests/api/worker/utils/spamClassification/PreprocessPatternsTest.ts
+++ b/test/tests/api/worker/utils/spamClassification/PreprocessPatternsTest.ts
@ -130,11 +130,11 @@ o.spec("PreprocessPatterns", () => {
 	o.spec("Url patterns", () => {
 		o.test("All recognized url patterns", async () => {
 			const urlsMap = new Map([
-				["https://tuta.com", "<URL-tuta.com>"],
-				["https://microsoft.com/outlook/test", "<URL-microsoft.com>"],
-				["https://subdomain.microsoft.com/outlook/test", "<URL-subdomain.microsoft.com>"],
-				["https://subdomain.spam.com/this/is/not/cool/dsfalkfjd2309jlk234oi2k", "<URL-subdomain.spam.com>"],
-				["https://subdomain.test.de/spam!", "<URL-subdomain.test.de>"],
+				["https://tuta.com", "TURLtuta.com"],
+				["https://microsoft.com/outlook/test", "TURLmicrosoft.com"],
+				["https://subdomain.microsoft.com/outlook/test", "TURLsubdomain.microsoft.com"],
+				["https://subdomain.spam.com/this/is/not/cool/dsfalkfjd2309jlk234oi2k", "TURLsubdomain.spam.com"],
+				["https://subdomain.test.de/spam!", "TURLsubdomain.test.de"],
 			])

 			for (const [domain, expectedToken] of urlsMap.entries()) {
@ -297,6 +297,8 @@ o.spec("PreprocessPatterns", () => {
 				["*", ML_SPECIAL_CHARACTER_TOKEN],
 				["(", ML_SPECIAL_CHARACTER_TOKEN],
 				[")", ML_SPECIAL_CHARACTER_TOKEN],
+				["<", ML_SPECIAL_CHARACTER_TOKEN],
+				[">", ML_SPECIAL_CHARACTER_TOKEN],
 				["+", ML_SPECIAL_CHARACTER_TOKEN],
 				["`", ML_SPECIAL_CHARACTER_TOKEN],
 				["_", ML_SPECIAL_CHARACTER_TOKEN],
@ -318,6 +320,7 @@ o.spec("PreprocessPatterns", () => {
 				["--", ML_SPECIAL_CHARACTER_TOKEN],
 				["---", ML_SPECIAL_CHARACTER_TOKEN],
 				["--- ---", `${ML_SPECIAL_CHARACTER_TOKEN} ${ML_SPECIAL_CHARACTER_TOKEN}`],
+				["[ ]", `${ML_SPECIAL_CHARACTER_TOKEN} ${ML_SPECIAL_CHARACTER_TOKEN}`],
 			])

 			for (const [specialCharSequence, expectedResult] of specialCharsMap) {
@ -327,7 +330,7 @@ o.spec("PreprocessPatterns", () => {
 		})

 		o.test("Not recognized special-character-like patterns", async () => {
-			const notSpecialChars = ["[", "]", "<", ">", "test-test"]
+			const notSpecialChars = ["§", "€"]

 			const notSpecialCharsText = notSpecialChars.join("\n")
 			let resultNotSpecialCharsText = notSpecialCharsText
--- a/test/tests/api/worker/utils/spamClassification/SpamClassifierTest.ts
+++ b/test/tests/api/worker/utils/spamClassification/SpamClassifierTest.ts
@ -4,7 +4,6 @@ import { parseCsv } from "../../../../../../src/common/misc/parsing/CsvParser"
 import {
 	DEFAULT_PREPROCESS_CONFIGURATION,
 	SpamClassifier,
-	spamClassifierTokenizer as testTokenize,
 	SpamTrainMailDatum,
 } from "../../../../../../src/mail-app/workerUtils/spamClassification/SpamClassifier"
 import { OfflineStoragePersistence } from "../../../../../../src/mail-app/workerUtils/index/OfflineStoragePersistence"
@ -36,6 +35,11 @@ export async function readMailDataFromCSV(filePath: string): Promise<{
 		const subject = row[8]
 		const body = row[10]
 		const label = row[11]
+		const from = row[0]
+		const to = row[1]
+		const cc = row[2]
+		const bcc = row[3]
+		const authStatus = row[4]

 		let isSpam = label === "spam" ? true : label === "ham" ? false : null
 		isSpam = assertNotNull(isSpam, "Unknown label detected: " + label)
@ -47,6 +51,11 @@ export async function readMailDataFromCSV(filePath: string): Promise<{
 			isSpam,
 			isSpamConfidence: 1,
 			ownerGroup: "owner",
+			sender: from,
+			toRecipients: to,
+			ccRecipients: cc,
+			bccRecipients: bcc,
+			authStatus: authStatus,
 		} as SpamTrainMailDatum)
 	}

@ -99,6 +108,11 @@ o.spec("SpamClassifierTest", () => {
 			isSpam: true,
 			isSpamConfidence: 1,
 			ownerGroup: "owner",
+			sender: "",
+			toRecipients: "",
+			ccRecipients: "",
+			bccRecipients: "",
+			authStatus: "",
 		}
 		const layersModel = object<Sequential>()
 		spamClassifier.addSpamClassifierForOwner(spamTrainMailDatum.ownerGroup, layersModel, false)
@ -119,6 +133,11 @@ o.spec("SpamClassifierTest", () => {
 			isSpam: false,
 			isSpamConfidence: 0,
 			ownerGroup: "owner",
+			sender: "",
+			toRecipients: "",
+			ccRecipients: "",
+			bccRecipients: "",
+			authStatus: "",
 		}

 		const layersModel = object<Sequential>()
@ -165,6 +184,11 @@ o.spec("SpamClassifierTest", () => {
 		const classifier = new SpamClassifier(object(), object(), object())
 		const mail = {
 			subject: `Sample Tokens and values`,
+			sender: "sender",
+			toRecipients: "toRecipients",
+			ccRecipients: "ccRecipients",
+			bccRecipients: "bccRecipients",
+			authStatus: "authStatus",
 			// prettier-ignore
 			body: `Hello, these are my MAC Address
 				FB-94-77-45-96-74
@ -228,8 +252,8 @@ o.spec("SpamClassifierTest", () => {
 				Special Characters
 				!
 				@
-				Not Special Characters
-				]
+				Not Special Character
+				§
 				Number Sequences:
 				26098375
 				IBAN: DE91 1002 0370 0320 2239 82
@ -252,84 +276,90 @@ this text is shown
 		} as SpamTrainMailDatum
 		const preprocessedMail = classifier.preprocessMail(mail)
 		// prettier-ignore
-		const expectedOutput = `Sample Tokens and values Hello <SPECIAL-CHAR>  these are my MAC Address
-\t\t\t\tFB <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER> -D5 <SPECIAL-CHAR>  <NUMBER> -7C
-\t\t\t\tB4 <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER> -2A-DE-D4
+		const expectedOutput = `Sample Tokens and values
+Hello TSPECIALCHAR  these are my MAC Address
+\t\t\t\tFB TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR D5 TSPECIALCHAR  TNUMBER  TSPECIALCHAR 7C
+\t\t\t\tB4 TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR 2A TSPECIALCHAR DE TSPECIALCHAR D4
 \t\t\t\talong with my ISBNs
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>
-\t\t\t\t <NUMBER> -X
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER
+\t\t\t\t TNUMBER  TSPECIALCHAR X
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER
 \t\t\t\tSSN
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER
 \t\t\t\tSHAs
 \t\t\t\t585eab9b3a5e4430e08f5096d636d0d475a8c69dae21a61c6f1b26c4bd8dd8c1
 \t\t\t\t7233d153f2e0725d3d212d1f27f30258fafd72b286d07b3b1d94e7e3c35dce67
 \t\t\t\t769f65bf44557df44fc5f99c014cbe98894107c9d7be0801f37c55b3776c3990
 \t\t\t\tPhone Numbers
-\t\t\t\t <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>   <NUMBER>
-\t\t\t\t <SPECIAL-CHAR>  <NUMBER>   <NUMBER>   <NUMBER>   <NUMBER>
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>
-\t\t\t\tVIN  <SPECIAL-CHAR> Vehicle identification number <SPECIAL-CHAR>
+\t\t\t\t TSPECIALCHAR  TNUMBER  TSPECIALCHAR   TNUMBER
+\t\t\t\t TSPECIALCHAR  TNUMBER   TNUMBER   TNUMBER   TNUMBER
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER
+\t\t\t\tVIN  TSPECIALCHAR Vehicle identification number TSPECIALCHAR
 \t\t\t\t3FADP4AJ3BM438397
 \t\t\t\tWAULT64B82N564937
 \t\t\t\tGUIDs
-\t\t\t\t781a9631 <SPECIAL-CHAR>  <NUMBER> -4f9c-bb36-25c3364b754b
-\t\t\t\t325783d4-a64e-453b-85e6-ed4b2cd4c9bf
+\t\t\t\t781a9631 TSPECIALCHAR  TNUMBER  TSPECIALCHAR 4f9c TSPECIALCHAR bb36 TSPECIALCHAR 25c3364b754b
+\t\t\t\t325783d4 TSPECIALCHAR a64e TSPECIALCHAR 453b TSPECIALCHAR 85e6 TSPECIALCHAR ed4b2cd4c9bf
 \t\t\t\tHex Colors
-\t\t\t\t <SPECIAL-CHAR> 2016c1
-\t\t\t\t <SPECIAL-CHAR> c090a4
-\t\t\t\t <SPECIAL-CHAR> c855f5
-\t\t\t\t <SPECIAL-CHAR>  <NUMBER>
+\t\t\t\t TSPECIALCHAR 2016c1
+\t\t\t\t TSPECIALCHAR c090a4
+\t\t\t\t TSPECIALCHAR c855f5
+\t\t\t\t TSPECIALCHAR  TNUMBER
 \t\t\t\tIPV4
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>
-\t\t\t\tOn Date <SPECIAL-CHAR>
-\t\t\t\t <DATE>
-\t\t\t\t <DATE>
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER
+\t\t\t\tOn Date TSPECIALCHAR
+\t\t\t\t TDATE
+\t\t\t\t TDATE
 \t\t\t\tNot Date
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>  <NUMBER>
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER  TSPECIALCHAR  TNUMBER
 \t\t\t\tURL
-\t\t\t\t <URL-tuta.com>
-\t\t\t\t <URL-subdomain.microsoft.com>
+\t\t\t\t TURLtuta TSPECIALCHAR com
+\t\t\t\t TURLsubdomain TSPECIALCHAR microsoft TSPECIALCHAR com
 \t\t\t\tNOT URL
-\t\t\t\t <URL-tuta>
+\t\t\t\t TURLtuta
 \t\t\t\tMAIL
-\t\t\t\t <EMAIL>
-\t\t\t\t <EMAIL>
+\t\t\t\t TEMAIL
+\t\t\t\t TEMAIL
 \t\t\t\tCredit Card
-\t\t\t\t <CREDIT-CARD>
-\t\t\t\t <CREDIT-CARD>
+\t\t\t\t TCREDITCARD
+\t\t\t\t TCREDITCARD
 \t\t\t\tNot Credit Card
-\t\t\t\t <NUMBER>   <NUMBER>
+\t\t\t\t TNUMBER   TNUMBER
 \t\t\t\tBit Coin Address
-\t\t\t\t <BITCOIN>
-\t\t\t\t <BITCOIN>
+\t\t\t\t TBITCOIN
+\t\t\t\t TBITCOIN
 \t\t\t\tNot BTC
 \t\t\t\t5213nYwhhGw2qpNijzfnKcbCG4z3hnrVA
 \t\t\t\t1OUm2eZK2ETeAo8v95WhZioQDy32YSerkD
 \t\t\t\tSpecial Characters
-\t\t\t\t <SPECIAL-CHAR>
-\t\t\t\t <SPECIAL-CHAR>
-\t\t\t\tNot Special Characters
-\t\t\t\t]
-\t\t\t\tNumber Sequences <SPECIAL-CHAR>
-\t\t\t\t <NUMBER>
-\t\t\t\tIBAN <SPECIAL-CHAR>  DE91  <CREDIT-CARD>  <NUMBER>
+\t\t\t\t TSPECIALCHAR
+\t\t\t\t TSPECIALCHAR
+\t\t\t\tNot Special Character
+\t\t\t\t§
+\t\t\t\tNumber Sequences TSPECIALCHAR
+\t\t\t\t TNUMBER
+\t\t\t\tIBAN TSPECIALCHAR  DE91  TCREDITCARD  TNUMBER
 \t\t\t\tNot Number Sequences
 \t\t\t\tSHLT116
-\t\t\t\tgb <SPECIAL-CHAR> 67ca4b
+\t\t\t\tgb TSPECIALCHAR 67ca4b
 \t\t\t\tOther values found in mails
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  €  <NUMBER>  m  <NUMBER>  Zi  <NUMBER>  <SPECIAL-CHAR>
-\t\t\t\tFax  <SPECIAL-CHAR>  <NUMBER>  <SPECIAL-CHAR>   <NUMBER>   <NUMBER>   <NUMBER>   <NUMBER>
-\t\t\t\tAugust  <NUMBER>  <SPECIAL-CHAR>   <NUMBER>
-\t\t\t\t <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  PM  <SPECIAL-CHAR>   <NUMBER>  <SPECIAL-CHAR>  <NUMBER>  PM
-\t\t\t\tand all text on other lines it seems <SPECIAL-CHAR>
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER  €  TNUMBER  m  TNUMBER  Zi  TNUMBER  TSPECIALCHAR
+\t\t\t\tFax  TSPECIALCHAR  TNUMBER  TSPECIALCHAR   TNUMBER   TNUMBER   TNUMBER   TNUMBER
+\t\t\t\tAugust  TNUMBER  TSPECIALCHAR   TNUMBER
+\t\t\t\t TNUMBER  TSPECIALCHAR  TNUMBER  PM  TSPECIALCHAR   TNUMBER  TSPECIALCHAR  TNUMBER  PM
+\t\t\t\tand all text on other lines it seems TSPECIALCHAR
 Button Text
-this text is shown`
+this text is shown
+sender
+toRecipients
+ccRecipients
+bccRecipients
+authStatus`
 		o.check(preprocessedMail).equals(expectedOutput)
 	})

@ -357,8 +387,24 @@ this text is shown`
 		await spamClassifier.initialize("firstGroup")
 		await spamClassifier.initialize("secondGroup")

-		const isSpamFirstMail = await spamClassifier.predict({ subject: "", body: "", ownerGroup: "firstGroup" })
-		const isSpamSecondMail = await spamClassifier.predict({ subject: "", body: "", ownerGroup: "secondGroup" })
+		const commonSpamFields = {
+			subject: "",
+			body: "",
+			sender: "string",
+			toRecipients: "string",
+			ccRecipients: "string",
+			bccRecipients: "string",
+			authStatus: "",
+		}
+
+		const isSpamFirstMail = await spamClassifier.predict({
+			ownerGroup: "firstGroup",
+			...commonSpamFields,
+		})
+		const isSpamSecondMail = await spamClassifier.predict({
+			ownerGroup: "secondGroup",
+			...commonSpamFields,
+		})

 		o(isSpamFirstMail).equals(true)
 		o(isSpamSecondMail).equals(false)
@ -434,48 +480,6 @@ if (DO_RUN_PERFORMANCE_ANALYSIS) {
 				let retrainCount = 0
 				let predictedSpam = false
 				while (!predictedSpam && retrainCount++ <= 3) {
-					// await copiedClassifier.updateModel([{ ...sample, isSpam: false }])
-
-					/*
-    isSpamConfidence: 2
-                    [
-      3, 2, 1, 3, 1,
-      1, 3, 2, 1, 5
-    ] = 22
-    isSpamConfidence: 3
-    [
-      2, 5, 1, 2, 1,
-      1, 1, 2, 1, 2
-    ] = 18
-
-    isSpamConfidence: 4
-    [
-      1, 1, 1, 2, 5,
-      1, 1, 1, 1, 5
-    ] = 19
-    Retraining finished. Took: 477ms
-    Retraining finished. Took: 1259ms
-    predicted new mail to be with probability 0.46 spam
-    Retraining finished. Took: 560ms
-    Retraining finished. Took: 1273ms
-
-    isSpamConfidence: 8
-    Retraining finished. Took: 486ms
-    Retraining finished. Took: 2289ms
-    predicted new mail to be with probability 0.82 spam
-    Retraining finished. Took: 580ms
-    Retraining finished. Took: 2356ms
-    predicted new mail to be with probability 1.00 spam
-    Retraining finished. Took: 556ms
-    Retraining finished. Took: 2357ms
-    predicted new mail to be with probability 0.52 spam
-    [
-      1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1
-    ]
-
-
-                     */
 					await copiedClassifier.updateModel("owner", [{ ...sample, isSpam: true, isSpamConfidence: 1 }])
 					predictedSpam = assertNotNull(await copiedClassifier.predict(sample))
 				}
--- a/test/tests/api/worker/utils/spamClassification/spam_classification_test_mails.csv
+++ b/test/tests/api/worker/utils/spamClassification/spam_classification_test_mails.csv
--- a/test/tests/mail/MailModelTest.ts
+++ b/test/tests/mail/MailModelTest.ts
@ -6,12 +6,14 @@ import {
 	BodyTypeRef,
 	ClientSpamClassifierResultTypeRef,
 	Mail,
+	MailAddressTypeRef,
 	MailDetails,
 	MailDetailsBlob,
 	MailDetailsBlobTypeRef,
 	MailDetailsTypeRef,
 	MailFolderTypeRef,
 	MailTypeRef,
+	RecipientsTypeRef,
 } from "../../../src/common/api/entities/tutanota/TypeRefs.js"
 import { EntityClient } from "../../../src/common/api/common/EntityClient.js"
 import { EntityRestClientMock } from "../api/worker/rest/EntityRestClientMock.js"
@ -140,6 +142,14 @@ o.spec("MailModelTest", function () {
 			mailDetails = createTestEntity(MailDetailsTypeRef, {
 				_id: "mailDetail",
 				body: createTestEntity(BodyTypeRef, { text: "some text" }),
+				recipients: createTestEntity(RecipientsTypeRef, {
+					toRecipients: [
+						createTestEntity(MailAddressTypeRef, {
+							name: "Recipient",
+							address: "recipient@tuta.com",
+						}),
+					],
+				}),
 			})
 			mail = createTestEntity(MailTypeRef, {
 				_id: ["mailListId", "mailId"],
@ -147,7 +157,9 @@ o.spec("MailModelTest", function () {
 				mailDetails: ["detailsList", mailDetails._id],
 				subject: "subject",
 				sets: [inboxFolder._id],
+				sender: createTestEntity(MailAddressTypeRef, { name: "Sender", address: "sender@tuta.com" }),
 				processingState: ProcessingState.INBOX_RULE_NOT_PROCESSED,
+				authStatus: "0",
 			})
 			const mailDetailsBlob: MailDetailsBlob = createTestEntity(MailDetailsBlobTypeRef, {
 				_id: mail.mailDetails!,
@ -296,6 +308,11 @@ o.spec("MailModelTest", function () {
 				subject: "subject",
 				isSpam: false,
 				isSpamConfidence: 1,
+				sender: "Sender sender@tuta.com",
+				toRecipients: "Recipient recipient@tuta.com",
+				ccRecipients: "",
+				bccRecipients: "",
+				authStatus: "TAUTHENTICATED",
 			}
 			verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
 			verify(spamClassifier.predict(anything()), { times: 0 })
@ -321,6 +338,11 @@ o.spec("MailModelTest", function () {
 				subject: "subject",
 				isSpam: false,
 				isSpamConfidence: 1,
+				sender: "Sender sender@tuta.com",
+				toRecipients: "Recipient recipient@tuta.com",
+				ccRecipients: "",
+				bccRecipients: "",
+				authStatus: "TAUTHENTICATED",
 			}
 			verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
 			verify(spamClassifier.predict(anything()), { times: 1 })
@ -352,6 +374,11 @@ o.spec("MailModelTest", function () {
 				subject: "subject",
 				isSpam: false,
 				isSpamConfidence: 1,
+				sender: "Sender sender@tuta.com",
+				toRecipients: "Recipient recipient@tuta.com",
+				ccRecipients: "",
+				bccRecipients: "",
+				authStatus: "TAUTHENTICATED",
 			}
 			verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
 			verify(spamClassifier.predict(anything()), { times: 1 })