Include header fields as tokens in the anti-spam classifier

Add the header fields (sender, toRecipients, ccRecipients, bccRecipients,
authStatus) to the anti-spam vectors. We also improve some of the
preprocessing steps and add an offline migration that drops the old spam
tables.

Co-authored-by: amm@tutao.de
Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com>
This commit is contained in:
das 2025-10-22 16:18:24 +02:00 committed by abp
parent 21ad4ce2c3
commit f8bbd32695
No known key found for this signature in database
GPG key ID: 791D4EC38A7AA7C2
13 changed files with 10918 additions and 10788 deletions

View file

@ -46,7 +46,7 @@ export const allowedImports = {
contacts: ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main", "mail-view", "date", "date-gui", "mail-editor"],
"calendar-view": ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main", "date", "date-gui", "sharing", "contacts"],
login: ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main"],
"spam-classifier": ["polyfill-helpers", "common", "common-min"],
"spam-classifier": ["polyfill-helpers", "common", "common-min", "main"],
worker: ["polyfill-helpers", "common-min", "common", "native-common", "native-worker", "wasm", "wasm-fallback"],
"pow-worker": [],
settings: [

View file

@ -46,9 +46,6 @@ import { AttributeModel } from "../../common/AttributeModel"
import { TypeModelResolver } from "../../common/EntityFunctions"
import { collapseId, expandId } from "../rest/RestClientIdUtils"
import { Category, syncMetrics } from "../utils/SyncMetrics"
import { hasError } from "../../common/utils/ErrorUtils"
import { SpamClassificationModel, SpamTrainMailDatum } from "../../../../mail-app/workerUtils/spamClassification/SpamClassifier"
import { Mail } from "../../entities/tutanota/TypeRefs"
/**
* this is the value of SQLITE_MAX_VARIABLE_NUMBER in sqlite3.c

View file

@ -7,6 +7,7 @@ import { offline6 } from "./migrations/offline-v6"
import { offline7 } from "./migrations/offline-v7"
import { offline8 } from "./migrations/offline-v8"
import { ProgrammingError } from "../../common/error/ProgrammingError"
import { offline9 } from "./migrations/offline-v9"
export interface OfflineMigration {
readonly version: number
@ -20,11 +21,11 @@ export interface OfflineMigration {
* Normally you should only add them to the end of the list but with offline ones it can be a bit tricky since they change the db structure itself so sometimes
* they should rather be in the beginning.
*/
export const OFFLINE_STORAGE_MIGRATIONS: ReadonlyArray<OfflineMigration> = [offline5, offline6, offline7, offline8]
export const OFFLINE_STORAGE_MIGRATIONS: ReadonlyArray<OfflineMigration> = [offline5, offline6, offline7, offline8, offline9]
// in cases where the actual migration is not there anymore (we clean up old migrations no client would apply anymore)
// and we create a new offline database, we still need to set the offline version to the current value.
export const CURRENT_OFFLINE_VERSION = 8
export const CURRENT_OFFLINE_VERSION = 9
/**
* Migrator for the offline storage between different versions of model. It is tightly coupled to the versions of API entities: every time we make an

View file

@ -0,0 +1,12 @@
import { OfflineMigration } from "../OfflineStorageMigrator.js"
import { OfflineStorage } from "../OfflineStorage.js"
import { SqlCipherFacade } from "../../../../native/common/generatedipc/SqlCipherFacade"
/**
 * Offline storage migration to version 9.
 *
 * Drops the `spam_classification_training_data` and `spam_classification_model` tables
 * (if they exist), because the stored spam data gained new fields.
 */
export const offline9: OfflineMigration = {
	version: 9,
	async migrate(storage: OfflineStorage, sqlCipherFacade: SqlCipherFacade) {
		console.log("dropping spam_classification_training_data and spam_classification_model, due to new fields")
		const dropStatements = [
			`DROP TABLE IF EXISTS spam_classification_training_data`,
			`DROP TABLE IF EXISTS spam_classification_model`,
		]
		// Run the statements one after another, in the listed order.
		for (const statement of dropStatements) {
			await sqlCipherFacade.run(statement, [])
		}
	},
}

View file

@ -1,13 +1,14 @@
import { createMoveMailData, Mail, MailDetails, MailFolder, MoveMailData } from "../../../common/api/entities/tutanota/TypeRefs"
import { createMoveMailData, Mail, MailAddress, MailDetails, MailFolder, MoveMailData } from "../../../common/api/entities/tutanota/TypeRefs"
import {
DEFAULT_IS_SPAM,
DEFAULT_IS_SPAM_CONFIDENCE,
getSpamConfidence,
MailAuthenticationStatus,
MailSetKind,
ProcessingState,
SpamDecision,
} from "../../../common/api/common/TutanotaConstants"
import type { SpamClassifier, SpamPredMailDatum, SpamTrainMailDatum } from "../../workerUtils/spamClassification/SpamClassifier"
import { SpamClassifier, SpamPredMailDatum, SpamTrainMailDatum } from "../../workerUtils/spamClassification/SpamClassifier"
import { getMailBodyText } from "../../../common/api/common/CommonMailUtils"
import { assertNotNull, debounce, isNotNull, Nullable, ofClass } from "@tutao/tutanota-utils"
import { MailFacade } from "../../../common/api/worker/facades/lazy/MailFacade"
@ -75,6 +76,7 @@ export class SpamClassificationHandler {
subject: mail.subject,
body: getMailBodyText(mailDetails.body),
ownerGroup: assertNotNull(mail._ownerGroup),
...extractSpamHeaderFeatures(mail, mailDetails),
}
const isSpam = (await this.spamClassifier?.predict(spamPredMailDatum)) ?? null
@ -141,7 +143,44 @@ export class SpamClassificationHandler {
isSpam: DEFAULT_IS_SPAM,
isSpamConfidence: DEFAULT_IS_SPAM_CONFIDENCE,
ownerGroup: assertNotNull(mail._ownerGroup),
...extractSpamHeaderFeatures(mail, mailDetails),
}
await this.spamClassifier?.storeSpamClassification(spamTrainMailDatum)
}
}
/**
 * Builds the header-derived spam features for a mail.
 *
 * @param mail supplies the sender and the authentication status
 * @param mailDetails supplies the to/cc/bcc recipients
 * @returns an object with `sender`, `toRecipients`, `ccRecipients`, `bccRecipients` and `authStatus`, all as strings
 */
export function extractSpamHeaderFeatures(mail: Mail, mailDetails: MailDetails) {
	const senderText = joinNamesAndMailAddresses([mail?.sender])
	const recipientTexts = extractRecipients(mailDetails)
	const authStatusToken = convertAuthStatusToSpamCategorizationToken(mail.authStatus)
	return {
		sender: senderText,
		toRecipients: recipientTexts.toRecipients,
		ccRecipients: recipientTexts.ccRecipients,
		bccRecipients: recipientTexts.bccRecipients,
		authStatus: authStatusToken,
	}
}
/**
 * Turns the to/cc/bcc recipient lists of the given mail details into one string per list.
 * A missing `recipients` object is passed on to the join helper as `undefined` lists.
 */
function extractRecipients(mailDetails: MailDetails) {
	const allRecipients = mailDetails.recipients
	return {
		toRecipients: joinNamesAndMailAddresses(allRecipients?.toRecipients),
		ccRecipients: joinNamesAndMailAddresses(allRecipients?.ccRecipients),
		bccRecipients: joinNamesAndMailAddresses(allRecipients?.bccRecipients),
	}
}
/**
 * Joins the names and addresses of the given recipients into one space-separated string,
 * e.g. `"Sender sender@tuta.com Other other@tuta.com"`.
 *
 * Missing pieces are skipped instead of being rendered as text: a nullish list, a nullish
 * entry (e.g. a mail without sender), or a nullish/empty name or address contribute nothing.
 * Previously such values ended up as the literal words "undefined"/"null" in the result and
 * an empty name produced a stray leading space.
 *
 * @param recipients the addresses to join; may be nullish or contain nullish entries
 * @returns the joined string, or `""` when there is nothing to join
 */
function joinNamesAndMailAddresses(recipients: ReadonlyArray<MailAddress | null | undefined> | null | undefined): string {
	if (recipients == null) {
		return ""
	}
	const entries: string[] = []
	for (const recipient of recipients) {
		// keep only the parts that are actually present, so no "undefined" placeholder is emitted
		const parts = [recipient?.name, recipient?.address].filter((part): part is string => typeof part === "string" && part.length > 0)
		if (parts.length > 0) {
			entries.push(parts.join(" "))
		}
	}
	return entries.join(" ")
}
/**
 * Maps a mail authentication status to the token used for it in the spam classification input.
 *
 * @param authStatus the mail's authentication status, compared against `MailAuthenticationStatus` values
 * @returns the token for a known status, or `""` for `null` and any other value
 */
function convertAuthStatusToSpamCategorizationToken(authStatus: string | null): string {
	switch (authStatus) {
		case MailAuthenticationStatus.AUTHENTICATED:
			return "TAUTHENTICATED"
		case MailAuthenticationStatus.HARD_FAIL:
			return "THARDFAIL"
		case MailAuthenticationStatus.SOFT_FAIL:
			return "TSOFTFAIL"
		case MailAuthenticationStatus.INVALID_MAIL_FROM:
			return "TINVALIDMAILFROM"
		case MailAuthenticationStatus.MISSING_MAIL_FROM:
			return "TMISSINGMAILFROM"
		default:
			return ""
	}
}

View file

@ -67,12 +67,14 @@ export const SearchTableDefinitions: Record<string, OfflineStorageTable> = Objec
})
export const SpamClassificationDefinitions: Record<string, OfflineStorageTable> = Object.freeze({
// Spam classification training data
spam_classification_training_data: {
definition:
"CREATE TABLE IF NOT EXISTS spam_classification_training_data (listId TEXT NOT NULL, elementId TEXT NOT NULL," +
" ownerGroup TEXT NOT NULL, subject TEXT NOT NULL, body TEXT NOT NULL, isSpam NUMBER, " +
"lastModified NUMBER NOT NULL, isSpamConfidence NUMBER NOT NULL, PRIMARY KEY (listId, elementId))",
"ownerGroup TEXT NOT NULL, subject TEXT NOT NULL, body TEXT NOT NULL, isSpam NUMBER," +
"lastModified NUMBER NOT NULL, isSpamConfidence NUMBER NOT NULL, sender TEXT NOT NULL," +
"toRecipients TEXT NOT NULL, ccRecipients TEXT NOT NULL, bccRecipients TEXT NOT NULL," +
"authStatus TEXT NOT NULL, PRIMARY KEY (listId, elementId))",
purgedWithCache: true,
},
@ -187,18 +189,24 @@ export class OfflineStoragePersistence {
async storeSpamClassification(spamTrainMailDatum: SpamTrainMailDatum): Promise<void> {
const { query, params } = sql`
INSERT
OR REPLACE INTO spam_classification_training_data(listId, elementId, ownerGroup, subject, body, isSpam, lastModified, isSpamConfidence)
INSERT
OR REPLACE INTO spam_classification_training_data(listId, elementId, ownerGroup, subject, body, isSpam,
lastModified, isSpamConfidence, sender, toRecipients, ccRecipients, bccRecipients, authStatus)
VALUES (
${listIdPart(spamTrainMailDatum.mailId)},
${elementIdPart(spamTrainMailDatum.mailId)},
${spamTrainMailDatum.ownerGroup},
${spamTrainMailDatum.subject},
${spamTrainMailDatum.body},
${spamTrainMailDatum.isSpam ? 1 : 0},
${Date.now()},
${spamTrainMailDatum.isSpamConfidence}
)`
${listIdPart(spamTrainMailDatum.mailId)},
${elementIdPart(spamTrainMailDatum.mailId)},
${spamTrainMailDatum.ownerGroup},
${spamTrainMailDatum.subject},
${spamTrainMailDatum.body},
${spamTrainMailDatum.isSpam ? 1 : 0},
${Date.now()},
${spamTrainMailDatum.isSpamConfidence},
${spamTrainMailDatum.sender},
${spamTrainMailDatum.toRecipients},
${spamTrainMailDatum.ccRecipients},
${spamTrainMailDatum.bccRecipients},
${spamTrainMailDatum.authStatus}
)`
await this.sqlCipherFacade.run(query, params)
}
@ -250,11 +258,21 @@ export class OfflineStoragePersistence {
}
async getCertainSpamClassificationTrainingDataAfterCutoff(cutoffTimestamp: number, ownerGroupId: Id): Promise<SpamTrainMailDatum[]> {
const { query, params } = sql`SELECT listId, elementId, subject, body, isSpam, isSpamConfidence
FROM spam_classification_training_data
WHERE lastModified > ${cutoffTimestamp}
AND isSpamConfidence > 0
AND ownerGroup = ${ownerGroupId}`
const { query, params } = sql`SELECT listId,
elementId,
subject,
body,
isSpam,
isSpamConfidence,
sender,
toRecipients,
ccRecipients,
bccRecipients,
authStatus
FROM spam_classification_training_data
WHERE lastModified > ${cutoffTimestamp}
AND isSpamConfidence > 0
AND ownerGroup = ${ownerGroupId}`
const resultRows = await this.sqlCipherFacade.all(query, params)
return resultRows.map(untagSqlObject).map((row) => row as unknown as SpamTrainMailDatum)
}

View file

@ -9,30 +9,30 @@ export const ML_DATE_REGEX = [
/\b(?<!-)\d{4}(?:-\d{1,2}){2}(?!-)\b/g, // 2023-12-01 | 2023-12-1
]
export const ML_DATE_TOKEN = " <DATE> "
export const ML_DATE_TOKEN = " TDATE "
export const ML_URL_REGEX = /(?:http|https|ftp|sftp):\/\/([\w.-]+)(?:\/[^\s]*)?/g
export const ML_URL_TOKEN = " <URL-$1> "
export const ML_URL_TOKEN = " TURL$1 "
export const ML_EMAIL_ADDR_REGEX = /(?:mailto:)?[A-Za-z0-9_+\-.]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g
export const ML_EMAIL_ADDR_TOKEN = " <EMAIL> "
export const ML_EMAIL_ADDR_TOKEN = " TEMAIL "
export const ML_BITCOIN_REGEX = /\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b/g
export const ML_BITCOIN_TOKEN = " <BITCOIN> "
export const ML_BITCOIN_TOKEN = " TBITCOIN "
export const ML_CREDIT_CARD_REGEX = /\b(\d{4}\s?){4}\b|\b[0-9]\d{13,16}\b/g
export const ML_CREDIT_CARD_TOKEN = " <CREDIT-CARD> "
export const ML_CREDIT_CARD_TOKEN = " TCREDITCARD "
export const ML_NUMBER_SEQUENCE_REGEX = /\b\d+\b/g
export const ML_NUMBER_SEQUENCE_TOKEN = " <NUMBER> "
export const ML_NUMBER_SEQUENCE_TOKEN = " TNUMBER "
export const ML_SPECIAL_CHARACTER_REGEX = /([!@#$%^&*()+`_=\\{}"':;?/,.~]+)(?![^<]*>)|(?!\w)[-]+(?!\w)/g
// Matches a run of one or more consecutive characters from the class below (global flag: every run in the text).
// NOTE: `,-.` inside the class is a range (U+002C..U+002E), which is exactly `,`, `-` and `.`;
// the hyphen is therefore matched through that range rather than as a standalone literal.
export const ML_SPECIAL_CHARACTER_REGEX = /([!@#$%^&*()[\]<>+`_=\\{}"':;?/,-.~]+)/g
export const ML_SPECIAL_CHARACTER_TOKEN = " <SPECIAL-CHAR> "
export const ML_SPECIAL_CHARACTER_TOKEN = " TSPECIALCHAR "
export const ML_SPACE_BEFORE_NEW_LINE_REGEX = /\s+\n/g

View file

@ -1,6 +1,16 @@
import { EntityClient } from "../../../common/api/common/EntityClient"
import { assertNotNull, isNotNull, lazyAsync } from "@tutao/tutanota-utils"
import { MailBag, MailboxGroupRootTypeRef, MailBoxTypeRef, MailFolder, MailFolderTypeRef, MailTypeRef } from "../../../common/api/entities/tutanota/TypeRefs"
import {
MailAddress,
MailBag,
MailboxGroupRootTypeRef,
MailBoxTypeRef,
MailDetails,
MailFolder,
MailFolderTypeRef,
MailTypeRef,
Recipients,
} from "../../../common/api/entities/tutanota/TypeRefs"
import { getMailSetKind, getSpamConfidence, MailSetKind } from "../../../common/api/common/TutanotaConstants"
import { elementIdPart, isSameId, listIdPart, timestampToGeneratedId } from "../../../common/api/common/utils/EntityUtils"
import { OfflineStoragePersistence } from "../index/OfflineStoragePersistence"
@ -8,6 +18,7 @@ import { getMailBodyText } from "../../../common/api/common/CommonMailUtils"
import { BulkMailLoader, MailWithMailDetails } from "../index/BulkMailLoader"
import { hasError } from "../../../common/api/common/utils/ErrorUtils"
import { SpamTrainMailDatum } from "./SpamClassifier"
import { extractSpamHeaderFeatures } from "../../mail/model/SpamClassificationHandler"
const INITIAL_SPAM_CLASSIFICATION_INDEX_INTERVAL_DAYS = 28
@ -30,7 +41,6 @@ export class SpamClassificationInitializer {
// available in the current mail bag
const data = await this.downloadMailAndMailDetailsByGroupMembership(ownerGroup)
data.filter((datum) => datum.isSpamConfidence > 0)
data.map((datum) => this.offlineStorage.storeSpamClassification(datum))
let spamMailsCount = 0
let hamMailsCount = 0
@ -53,7 +63,6 @@ export class SpamClassificationInitializer {
const mailbox = await this.entityClient.load(MailBoxTypeRef, mailboxGroupRoot.mailbox)
const mailSets = await this.entityClient.loadAll(MailFolderTypeRef, assertNotNull(mailbox.folders).folders)
const spamFolder = mailSets.find((s) => getMailSetKind(s) === MailSetKind.SPAM)!
const inboxFolder = mailSets.find((s) => getMailSetKind(s) === MailSetKind.INBOX)!
const downloadedMailClassificationDatas = new Array<SpamTrainMailDatum>()
const allMailbags = [assertNotNull(mailbox.currentMailBag), ...mailbox.archivedMailBags].reverse() // sorted from latest to oldest
@ -63,14 +72,14 @@ export class SpamClassificationInitializer {
isNotNull(currentMailbag) && downloadedMailClassificationDatas.length < this.MIN_MAILS_COUNT;
currentMailbag = allMailbags.pop()
) {
const mailsOfThisMailbag = await this.downloadMailAndMailDetailsByMailbag(currentMailbag, spamFolder, inboxFolder)
const mailsOfThisMailbag = await this.downloadMailAndMailDetailsByMailbag(currentMailbag, spamFolder)
downloadedMailClassificationDatas.push(...mailsOfThisMailbag)
}
return downloadedMailClassificationDatas
}
private async downloadMailAndMailDetailsByMailbag(mailbag: MailBag, spamFolder: MailFolder, inboxFolder: MailFolder): Promise<Array<SpamTrainMailDatum>> {
private async downloadMailAndMailDetailsByMailbag(mailbag: MailBag, spamFolder: MailFolder): Promise<Array<SpamTrainMailDatum>> {
const { LocalTimeDateProvider } = await import("../../../common/api/worker/DateProvider.js")
const dateProvider = new LocalTimeDateProvider()
const startTime = dateProvider.getStartOfDayShiftedBy(this.TIME_LIMIT).getTime()
@ -84,11 +93,12 @@ export class SpamClassificationInitializer {
// Download mail details
.then((mails) => bulkMailLoader.loadMailDetails(mails))
// Map to spam mail datum
.then((mails) => mails.map((m) => this.mailWithDetailsToMailDatum(spamFolder, inboxFolder, m)))
.then((mails) => mails.map((m) => this.mailWithDetailsToMailDatum(spamFolder, m)))
}
private mailWithDetailsToMailDatum(spamFolder: MailFolder, inboxFolder: MailFolder, { mail, mailDetails }: MailWithMailDetails): SpamTrainMailDatum {
private mailWithDetailsToMailDatum(spamFolder: MailFolder, { mail, mailDetails }: MailWithMailDetails): SpamTrainMailDatum {
const isSpam = mail.sets.some((folderId) => isSameId(folderId, spamFolder._id))
return {
mailId: mail._id,
subject: mail.subject,
@ -98,6 +108,7 @@ export class SpamClassificationInitializer {
listId: listIdPart(mail._id),
elementId: elementIdPart(mail._id),
ownerGroup: assertNotNull(mail._ownerGroup),
...extractSpamHeaderFeatures(mail, mailDetails),
} as SpamTrainMailDatum
}
}

View file

@ -54,12 +54,22 @@ export type SpamTrainMailDatum = {
isSpam: boolean
isSpamConfidence: number
ownerGroup: Id
sender: string
toRecipients: string
ccRecipients: string
bccRecipients: string
authStatus: string
}
/**
 * Input passed to the spam classifier's `predict` for a single mail.
 * Carries the same text and header fields as the training datum, without the label fields.
 */
export type SpamPredMailDatum = {
subject: string
body: string
// owner group of the mail; presumably selects the per-owner model used for prediction
ownerGroup: Id
// header features, all as plain strings (appended line by line to the preprocessed mail text)
sender: string
toRecipients: string
ccRecipients: string
bccRecipients: string
// authentication status token, e.g. "TAUTHENTICATED"; "" when the status is not one of the known values
authStatus: string
}
const PREDICTION_THRESHOLD = 0.55
@ -223,9 +233,16 @@ export class SpamClassifier {
preprocessedMail = preprocessedMail.replaceAll(ML_SPACE_BEFORE_NEW_LINE_REGEX, ML_SPACE_BEFORE_NEW_LINE_TOKEN)
}
preprocessedMail += this.getHeaderFeatures(mail)
return preprocessedMail
}
/**
 * Renders the header features of a mail as text to append to the preprocessed mail:
 * sender, to, cc, bcc recipients and auth status, in that order, each preceded by a newline.
 */
private getHeaderFeatures(mail: SpamTrainMailDatum | SpamPredMailDatum): string {
	const headerFields = [mail.sender, mail.toRecipients, mail.ccRecipients, mail.bccRecipients, mail.authStatus]
	return headerFields.map((field) => `\n${field}`).join("")
}
public async initialTraining(mails: SpamTrainMailDatum[]): Promise<TrainingPerformance> {
const preprocessingStart = performance.now()
const tokenizedMails = await promiseMap(mails, (mail) => spamClassifierTokenizer(this.preprocessMail(mail)))
@ -497,7 +514,8 @@ export class SpamClassifier {
private concatSubjectAndBody(mail: SpamTrainMailDatum | SpamPredMailDatum) {
const subject = mail.subject || ""
const body = mail.body || ""
const concatenated = `${subject} ${body}`.trim()
const concatenated = `${subject}\n${body}`.trim()
return concatenated.length > 0 ? concatenated : " "
}

View file

@ -130,11 +130,11 @@ o.spec("PreprocessPatterns", () => {
o.spec("Url patterns", () => {
o.test("All recognized url patterns", async () => {
const urlsMap = new Map([
["https://tuta.com", "<URL-tuta.com>"],
["https://microsoft.com/outlook/test", "<URL-microsoft.com>"],
["https://subdomain.microsoft.com/outlook/test", "<URL-subdomain.microsoft.com>"],
["https://subdomain.spam.com/this/is/not/cool/dsfalkfjd2309jlk234oi2k", "<URL-subdomain.spam.com>"],
["https://subdomain.test.de/spam!", "<URL-subdomain.test.de>"],
["https://tuta.com", "TURLtuta.com"],
["https://microsoft.com/outlook/test", "TURLmicrosoft.com"],
["https://subdomain.microsoft.com/outlook/test", "TURLsubdomain.microsoft.com"],
["https://subdomain.spam.com/this/is/not/cool/dsfalkfjd2309jlk234oi2k", "TURLsubdomain.spam.com"],
["https://subdomain.test.de/spam!", "TURLsubdomain.test.de"],
])
for (const [domain, expectedToken] of urlsMap.entries()) {
@ -297,6 +297,8 @@ o.spec("PreprocessPatterns", () => {
["*", ML_SPECIAL_CHARACTER_TOKEN],
["(", ML_SPECIAL_CHARACTER_TOKEN],
[")", ML_SPECIAL_CHARACTER_TOKEN],
["<", ML_SPECIAL_CHARACTER_TOKEN],
[">", ML_SPECIAL_CHARACTER_TOKEN],
["+", ML_SPECIAL_CHARACTER_TOKEN],
["`", ML_SPECIAL_CHARACTER_TOKEN],
["_", ML_SPECIAL_CHARACTER_TOKEN],
@ -318,6 +320,7 @@ o.spec("PreprocessPatterns", () => {
["--", ML_SPECIAL_CHARACTER_TOKEN],
["---", ML_SPECIAL_CHARACTER_TOKEN],
["--- ---", `${ML_SPECIAL_CHARACTER_TOKEN} ${ML_SPECIAL_CHARACTER_TOKEN}`],
["[ ]", `${ML_SPECIAL_CHARACTER_TOKEN} ${ML_SPECIAL_CHARACTER_TOKEN}`],
])
for (const [specialCharSequence, expectedResult] of specialCharsMap) {
@ -327,7 +330,7 @@ o.spec("PreprocessPatterns", () => {
})
o.test("Not recognized special-character-like patterns", async () => {
const notSpecialChars = ["[", "]", "<", ">", "test-test"]
const notSpecialChars = ["§", "€"]
const notSpecialCharsText = notSpecialChars.join("\n")
let resultNotSpecialCharsText = notSpecialCharsText

View file

@ -4,7 +4,6 @@ import { parseCsv } from "../../../../../../src/common/misc/parsing/CsvParser"
import {
DEFAULT_PREPROCESS_CONFIGURATION,
SpamClassifier,
spamClassifierTokenizer as testTokenize,
SpamTrainMailDatum,
} from "../../../../../../src/mail-app/workerUtils/spamClassification/SpamClassifier"
import { OfflineStoragePersistence } from "../../../../../../src/mail-app/workerUtils/index/OfflineStoragePersistence"
@ -36,6 +35,11 @@ export async function readMailDataFromCSV(filePath: string): Promise<{
const subject = row[8]
const body = row[10]
const label = row[11]
const from = row[0]
const to = row[1]
const cc = row[2]
const bcc = row[3]
const authStatus = row[4]
let isSpam = label === "spam" ? true : label === "ham" ? false : null
isSpam = assertNotNull(isSpam, "Unknown label detected: " + label)
@ -47,6 +51,11 @@ export async function readMailDataFromCSV(filePath: string): Promise<{
isSpam,
isSpamConfidence: 1,
ownerGroup: "owner",
sender: from,
toRecipients: to,
ccRecipients: cc,
bccRecipients: bcc,
authStatus: authStatus,
} as SpamTrainMailDatum)
}
@ -99,6 +108,11 @@ o.spec("SpamClassifierTest", () => {
isSpam: true,
isSpamConfidence: 1,
ownerGroup: "owner",
sender: "",
toRecipients: "",
ccRecipients: "",
bccRecipients: "",
authStatus: "",
}
const layersModel = object<Sequential>()
spamClassifier.addSpamClassifierForOwner(spamTrainMailDatum.ownerGroup, layersModel, false)
@ -119,6 +133,11 @@ o.spec("SpamClassifierTest", () => {
isSpam: false,
isSpamConfidence: 0,
ownerGroup: "owner",
sender: "",
toRecipients: "",
ccRecipients: "",
bccRecipients: "",
authStatus: "",
}
const layersModel = object<Sequential>()
@ -165,6 +184,11 @@ o.spec("SpamClassifierTest", () => {
const classifier = new SpamClassifier(object(), object(), object())
const mail = {
subject: `Sample Tokens and values`,
sender: "sender",
toRecipients: "toRecipients",
ccRecipients: "ccRecipients",
bccRecipients: "bccRecipients",
authStatus: "authStatus",
// prettier-ignore
body: `Hello, these are my MAC Address
FB-94-77-45-96-74
@ -228,8 +252,8 @@ o.spec("SpamClassifierTest", () => {
Special Characters
!
@
Not Special Characters
]
Not Special Character
§
Number Sequences:
26098375
IBAN: DE91 1002 0370 0320 2239 82
@ -252,84 +276,90 @@ this text is shown
} as SpamTrainMailDatum
const preprocessedMail = classifier.preprocessMail(mail)
// prettier-ignore
const expectedOutput = `Sample Tokens and values Hello <SPECIAL-CHAR> these are my MAC Address
\t\t\t\tFB <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> -D5 <SPECIAL-CHAR> <NUMBER> -7C
\t\t\t\tB4 <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> -2A-DE-D4
const expectedOutput = `Sample Tokens and values
Hello TSPECIALCHAR these are my MAC Address
\t\t\t\tFB TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR D5 TSPECIALCHAR TNUMBER TSPECIALCHAR 7C
\t\t\t\tB4 TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR 2A TSPECIALCHAR DE TSPECIALCHAR D4
\t\t\t\talong with my ISBNs
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\t <NUMBER> -X
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR X
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\tSSN
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\tSHAs
\t\t\t\t585eab9b3a5e4430e08f5096d636d0d475a8c69dae21a61c6f1b26c4bd8dd8c1
\t\t\t\t7233d153f2e0725d3d212d1f27f30258fafd72b286d07b3b1d94e7e3c35dce67
\t\t\t\t769f65bf44557df44fc5f99c014cbe98894107c9d7be0801f37c55b3776c3990
\t\t\t\tPhone Numbers
\t\t\t\t <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\t <SPECIAL-CHAR> <NUMBER> <NUMBER> <NUMBER> <NUMBER>
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\tVIN <SPECIAL-CHAR> Vehicle identification number <SPECIAL-CHAR>
\t\t\t\t TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TSPECIALCHAR TNUMBER TNUMBER TNUMBER TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\tVIN TSPECIALCHAR Vehicle identification number TSPECIALCHAR
\t\t\t\t3FADP4AJ3BM438397
\t\t\t\tWAULT64B82N564937
\t\t\t\tGUIDs
\t\t\t\t781a9631 <SPECIAL-CHAR> <NUMBER> -4f9c-bb36-25c3364b754b
\t\t\t\t325783d4-a64e-453b-85e6-ed4b2cd4c9bf
\t\t\t\t781a9631 TSPECIALCHAR TNUMBER TSPECIALCHAR 4f9c TSPECIALCHAR bb36 TSPECIALCHAR 25c3364b754b
\t\t\t\t325783d4 TSPECIALCHAR a64e TSPECIALCHAR 453b TSPECIALCHAR 85e6 TSPECIALCHAR ed4b2cd4c9bf
\t\t\t\tHex Colors
\t\t\t\t <SPECIAL-CHAR> 2016c1
\t\t\t\t <SPECIAL-CHAR> c090a4
\t\t\t\t <SPECIAL-CHAR> c855f5
\t\t\t\t <SPECIAL-CHAR> <NUMBER>
\t\t\t\t TSPECIALCHAR 2016c1
\t\t\t\t TSPECIALCHAR c090a4
\t\t\t\t TSPECIALCHAR c855f5
\t\t\t\t TSPECIALCHAR TNUMBER
\t\t\t\tIPV4
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\tOn Date <SPECIAL-CHAR>
\t\t\t\t <DATE>
\t\t\t\t <DATE>
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\tOn Date TSPECIALCHAR
\t\t\t\t TDATE
\t\t\t\t TDATE
\t\t\t\tNot Date
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\tURL
\t\t\t\t <URL-tuta.com>
\t\t\t\t <URL-subdomain.microsoft.com>
\t\t\t\t TURLtuta TSPECIALCHAR com
\t\t\t\t TURLsubdomain TSPECIALCHAR microsoft TSPECIALCHAR com
\t\t\t\tNOT URL
\t\t\t\t <URL-tuta>
\t\t\t\t TURLtuta
\t\t\t\tMAIL
\t\t\t\t <EMAIL>
\t\t\t\t <EMAIL>
\t\t\t\t TEMAIL
\t\t\t\t TEMAIL
\t\t\t\tCredit Card
\t\t\t\t <CREDIT-CARD>
\t\t\t\t <CREDIT-CARD>
\t\t\t\t TCREDITCARD
\t\t\t\t TCREDITCARD
\t\t\t\tNot Credit Card
\t\t\t\t <NUMBER> <NUMBER>
\t\t\t\t TNUMBER TNUMBER
\t\t\t\tBit Coin Address
\t\t\t\t <BITCOIN>
\t\t\t\t <BITCOIN>
\t\t\t\t TBITCOIN
\t\t\t\t TBITCOIN
\t\t\t\tNot BTC
\t\t\t\t5213nYwhhGw2qpNijzfnKcbCG4z3hnrVA
\t\t\t\t1OUm2eZK2ETeAo8v95WhZioQDy32YSerkD
\t\t\t\tSpecial Characters
\t\t\t\t <SPECIAL-CHAR>
\t\t\t\t <SPECIAL-CHAR>
\t\t\t\tNot Special Characters
\t\t\t\t]
\t\t\t\tNumber Sequences <SPECIAL-CHAR>
\t\t\t\t <NUMBER>
\t\t\t\tIBAN <SPECIAL-CHAR> DE91 <CREDIT-CARD> <NUMBER>
\t\t\t\t TSPECIALCHAR
\t\t\t\t TSPECIALCHAR
\t\t\t\tNot Special Character
\t\t\t\t§
\t\t\t\tNumber Sequences TSPECIALCHAR
\t\t\t\t TNUMBER
\t\t\t\tIBAN TSPECIALCHAR DE91 TCREDITCARD TNUMBER
\t\t\t\tNot Number Sequences
\t\t\t\tSHLT116
\t\t\t\tgb <SPECIAL-CHAR> 67ca4b
\t\t\t\tgb TSPECIALCHAR 67ca4b
\t\t\t\tOther values found in mails
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <NUMBER> m <NUMBER> Zi <NUMBER> <SPECIAL-CHAR>
\t\t\t\tFax <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <NUMBER> <NUMBER> <NUMBER>
\t\t\t\tAugust <NUMBER> <SPECIAL-CHAR> <NUMBER>
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> PM <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> PM
\t\t\t\tand all text on other lines it seems <SPECIAL-CHAR>
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TNUMBER m TNUMBER Zi TNUMBER TSPECIALCHAR
\t\t\t\tFax TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TNUMBER TNUMBER TNUMBER
\t\t\t\tAugust TNUMBER TSPECIALCHAR TNUMBER
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER PM TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER PM
\t\t\t\tand all text on other lines it seems TSPECIALCHAR
Button Text
this text is shown`
this text is shown
sender
toRecipients
ccRecipients
bccRecipients
authStatus`
o.check(preprocessedMail).equals(expectedOutput)
})
@ -357,8 +387,24 @@ this text is shown`
await spamClassifier.initialize("firstGroup")
await spamClassifier.initialize("secondGroup")
const isSpamFirstMail = await spamClassifier.predict({ subject: "", body: "", ownerGroup: "firstGroup" })
const isSpamSecondMail = await spamClassifier.predict({ subject: "", body: "", ownerGroup: "secondGroup" })
const commonSpamFields = {
subject: "",
body: "",
sender: "string",
toRecipients: "string",
ccRecipients: "string",
bccRecipients: "string",
authStatus: "",
}
const isSpamFirstMail = await spamClassifier.predict({
ownerGroup: "firstGroup",
...commonSpamFields,
})
const isSpamSecondMail = await spamClassifier.predict({
ownerGroup: "secondGroup",
...commonSpamFields,
})
o(isSpamFirstMail).equals(true)
o(isSpamSecondMail).equals(false)
@ -434,48 +480,6 @@ if (DO_RUN_PERFORMANCE_ANALYSIS) {
let retrainCount = 0
let predictedSpam = false
while (!predictedSpam && retrainCount++ <= 3) {
// await copiedClassifier.updateModel([{ ...sample, isSpam: false }])
/*
isSpamConfidence: 2
[
3, 2, 1, 3, 1,
1, 3, 2, 1, 5
] = 22
isSpamConfidence: 3
[
2, 5, 1, 2, 1,
1, 1, 2, 1, 2
] = 18
isSpamConfidence: 4
[
1, 1, 1, 2, 5,
1, 1, 1, 1, 5
] = 19
Retraining finished. Took: 477ms
Retraining finished. Took: 1259ms
predicted new mail to be with probability 0.46 spam
Retraining finished. Took: 560ms
Retraining finished. Took: 1273ms
isSpamConfidence: 8
Retraining finished. Took: 486ms
Retraining finished. Took: 2289ms
predicted new mail to be with probability 0.82 spam
Retraining finished. Took: 580ms
Retraining finished. Took: 2356ms
predicted new mail to be with probability 1.00 spam
Retraining finished. Took: 556ms
Retraining finished. Took: 2357ms
predicted new mail to be with probability 0.52 spam
[
1, 1, 1, 1, 1,
1, 1, 1, 1, 1
]
*/
await copiedClassifier.updateModel("owner", [{ ...sample, isSpam: true, isSpamConfidence: 1 }])
predictedSpam = assertNotNull(await copiedClassifier.predict(sample))
}

File diff suppressed because one or more lines are too long

View file

@ -6,12 +6,14 @@ import {
BodyTypeRef,
ClientSpamClassifierResultTypeRef,
Mail,
MailAddressTypeRef,
MailDetails,
MailDetailsBlob,
MailDetailsBlobTypeRef,
MailDetailsTypeRef,
MailFolderTypeRef,
MailTypeRef,
RecipientsTypeRef,
} from "../../../src/common/api/entities/tutanota/TypeRefs.js"
import { EntityClient } from "../../../src/common/api/common/EntityClient.js"
import { EntityRestClientMock } from "../api/worker/rest/EntityRestClientMock.js"
@ -140,6 +142,14 @@ o.spec("MailModelTest", function () {
mailDetails = createTestEntity(MailDetailsTypeRef, {
_id: "mailDetail",
body: createTestEntity(BodyTypeRef, { text: "some text" }),
recipients: createTestEntity(RecipientsTypeRef, {
toRecipients: [
createTestEntity(MailAddressTypeRef, {
name: "Recipient",
address: "recipient@tuta.com",
}),
],
}),
})
mail = createTestEntity(MailTypeRef, {
_id: ["mailListId", "mailId"],
@ -147,7 +157,9 @@ o.spec("MailModelTest", function () {
mailDetails: ["detailsList", mailDetails._id],
subject: "subject",
sets: [inboxFolder._id],
sender: createTestEntity(MailAddressTypeRef, { name: "Sender", address: "sender@tuta.com" }),
processingState: ProcessingState.INBOX_RULE_NOT_PROCESSED,
authStatus: "0",
})
const mailDetailsBlob: MailDetailsBlob = createTestEntity(MailDetailsBlobTypeRef, {
_id: mail.mailDetails!,
@ -296,6 +308,11 @@ o.spec("MailModelTest", function () {
subject: "subject",
isSpam: false,
isSpamConfidence: 1,
sender: "Sender sender@tuta.com",
toRecipients: "Recipient recipient@tuta.com",
ccRecipients: "",
bccRecipients: "",
authStatus: "TAUTHENTICATED",
}
verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
verify(spamClassifier.predict(anything()), { times: 0 })
@ -321,6 +338,11 @@ o.spec("MailModelTest", function () {
subject: "subject",
isSpam: false,
isSpamConfidence: 1,
sender: "Sender sender@tuta.com",
toRecipients: "Recipient recipient@tuta.com",
ccRecipients: "",
bccRecipients: "",
authStatus: "TAUTHENTICATED",
}
verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
verify(spamClassifier.predict(anything()), { times: 1 })
@ -352,6 +374,11 @@ o.spec("MailModelTest", function () {
subject: "subject",
isSpam: false,
isSpamConfidence: 1,
sender: "Sender sender@tuta.com",
toRecipients: "Recipient recipient@tuta.com",
ccRecipients: "",
bccRecipients: "",
authStatus: "TAUTHENTICATED",
}
verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
verify(spamClassifier.predict(anything()), { times: 1 })