mirror of
https://github.com/tutao/tutanota.git
synced 2025-12-08 06:09:50 +00:00
Include header fields as tokens in the anti-spam
Add the header fields (sender, toRecipients, ccRecipients, bccRecipients, authStatus) to the anti-spam vectors. We also improve some of the preprocessing steps and add an offline migration that deletes the old spam tables. Co-authored-by: amm@tutao.de Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com>
This commit is contained in:
parent
21ad4ce2c3
commit
f8bbd32695
13 changed files with 10918 additions and 10788 deletions
|
|
@ -46,7 +46,7 @@ export const allowedImports = {
|
||||||
contacts: ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main", "mail-view", "date", "date-gui", "mail-editor"],
|
contacts: ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main", "mail-view", "date", "date-gui", "mail-editor"],
|
||||||
"calendar-view": ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main", "date", "date-gui", "sharing", "contacts"],
|
"calendar-view": ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main", "date", "date-gui", "sharing", "contacts"],
|
||||||
login: ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main"],
|
login: ["polyfill-helpers", "common-min", "common", "boot", "gui-base", "main"],
|
||||||
"spam-classifier": ["polyfill-helpers", "common", "common-min"],
|
"spam-classifier": ["polyfill-helpers", "common", "common-min", "main"],
|
||||||
worker: ["polyfill-helpers", "common-min", "common", "native-common", "native-worker", "wasm", "wasm-fallback"],
|
worker: ["polyfill-helpers", "common-min", "common", "native-common", "native-worker", "wasm", "wasm-fallback"],
|
||||||
"pow-worker": [],
|
"pow-worker": [],
|
||||||
settings: [
|
settings: [
|
||||||
|
|
|
||||||
|
|
@ -46,9 +46,6 @@ import { AttributeModel } from "../../common/AttributeModel"
|
||||||
import { TypeModelResolver } from "../../common/EntityFunctions"
|
import { TypeModelResolver } from "../../common/EntityFunctions"
|
||||||
import { collapseId, expandId } from "../rest/RestClientIdUtils"
|
import { collapseId, expandId } from "../rest/RestClientIdUtils"
|
||||||
import { Category, syncMetrics } from "../utils/SyncMetrics"
|
import { Category, syncMetrics } from "../utils/SyncMetrics"
|
||||||
import { hasError } from "../../common/utils/ErrorUtils"
|
|
||||||
import { SpamClassificationModel, SpamTrainMailDatum } from "../../../../mail-app/workerUtils/spamClassification/SpamClassifier"
|
|
||||||
import { Mail } from "../../entities/tutanota/TypeRefs"
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* this is the value of SQLITE_MAX_VARIABLE_NUMBER in sqlite3.c
|
* this is the value of SQLITE_MAX_VARIABLE_NUMBER in sqlite3.c
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ import { offline6 } from "./migrations/offline-v6"
|
||||||
import { offline7 } from "./migrations/offline-v7"
|
import { offline7 } from "./migrations/offline-v7"
|
||||||
import { offline8 } from "./migrations/offline-v8"
|
import { offline8 } from "./migrations/offline-v8"
|
||||||
import { ProgrammingError } from "../../common/error/ProgrammingError"
|
import { ProgrammingError } from "../../common/error/ProgrammingError"
|
||||||
|
import { offline9 } from "./migrations/offline-v9"
|
||||||
|
|
||||||
export interface OfflineMigration {
|
export interface OfflineMigration {
|
||||||
readonly version: number
|
readonly version: number
|
||||||
|
|
@ -20,11 +21,11 @@ export interface OfflineMigration {
|
||||||
* Normally you should only add them to the end of the list but with offline ones it can be a bit tricky since they change the db structure itself so sometimes
|
* Normally you should only add them to the end of the list but with offline ones it can be a bit tricky since they change the db structure itself so sometimes
|
||||||
* they should rather be in the beginning.
|
* they should rather be in the beginning.
|
||||||
*/
|
*/
|
||||||
export const OFFLINE_STORAGE_MIGRATIONS: ReadonlyArray<OfflineMigration> = [offline5, offline6, offline7, offline8]
|
export const OFFLINE_STORAGE_MIGRATIONS: ReadonlyArray<OfflineMigration> = [offline5, offline6, offline7, offline8, offline9]
|
||||||
|
|
||||||
// in cases where the actual migration is not there anymore (we clean up old migrations no client would apply anymore)
|
// in cases where the actual migration is not there anymore (we clean up old migrations no client would apply anymore)
|
||||||
// and we create a new offline database, we still need to set the offline version to the current value.
|
// and we create a new offline database, we still need to set the offline version to the current value.
|
||||||
export const CURRENT_OFFLINE_VERSION = 8
|
export const CURRENT_OFFLINE_VERSION = 9
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Migrator for the offline storage between different versions of model. It is tightly coupled to the versions of API entities: every time we make an
|
* Migrator for the offline storage between different versions of model. It is tightly coupled to the versions of API entities: every time we make an
|
||||||
|
|
|
||||||
12
src/common/api/worker/offline/migrations/offline-v9.ts
Normal file
12
src/common/api/worker/offline/migrations/offline-v9.ts
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
import { OfflineMigration } from "../OfflineStorageMigrator.js"
|
||||||
|
import { OfflineStorage } from "../OfflineStorage.js"
|
||||||
|
import { SqlCipherFacade } from "../../../../native/common/generatedipc/SqlCipherFacade"
|
||||||
|
|
||||||
|
/**
 * Offline storage migration to version 9.
 *
 * Drops the `spam_classification_training_data` and `spam_classification_model` tables if they exist.
 * The log message gives the reason: the spam classification data gained new fields.
 */
export const offline9: OfflineMigration = {
	version: 9,
	async migrate(storage: OfflineStorage, sqlCipherFacade: SqlCipherFacade) {
		console.log("dropping spam_classification_training_data and spam_classification_model, due to new fields")

		const dropStatements = [`DROP TABLE IF EXISTS spam_classification_training_data`, `DROP TABLE IF EXISTS spam_classification_model`]
		// Executed one after another, training data first, then the model.
		for (const statement of dropStatements) {
			await sqlCipherFacade.run(statement, [])
		}
	},
}
|
||||||
|
|
@ -1,13 +1,14 @@
|
||||||
import { createMoveMailData, Mail, MailDetails, MailFolder, MoveMailData } from "../../../common/api/entities/tutanota/TypeRefs"
|
import { createMoveMailData, Mail, MailAddress, MailDetails, MailFolder, MoveMailData } from "../../../common/api/entities/tutanota/TypeRefs"
|
||||||
import {
|
import {
|
||||||
DEFAULT_IS_SPAM,
|
DEFAULT_IS_SPAM,
|
||||||
DEFAULT_IS_SPAM_CONFIDENCE,
|
DEFAULT_IS_SPAM_CONFIDENCE,
|
||||||
getSpamConfidence,
|
getSpamConfidence,
|
||||||
|
MailAuthenticationStatus,
|
||||||
MailSetKind,
|
MailSetKind,
|
||||||
ProcessingState,
|
ProcessingState,
|
||||||
SpamDecision,
|
SpamDecision,
|
||||||
} from "../../../common/api/common/TutanotaConstants"
|
} from "../../../common/api/common/TutanotaConstants"
|
||||||
import type { SpamClassifier, SpamPredMailDatum, SpamTrainMailDatum } from "../../workerUtils/spamClassification/SpamClassifier"
|
import { SpamClassifier, SpamPredMailDatum, SpamTrainMailDatum } from "../../workerUtils/spamClassification/SpamClassifier"
|
||||||
import { getMailBodyText } from "../../../common/api/common/CommonMailUtils"
|
import { getMailBodyText } from "../../../common/api/common/CommonMailUtils"
|
||||||
import { assertNotNull, debounce, isNotNull, Nullable, ofClass } from "@tutao/tutanota-utils"
|
import { assertNotNull, debounce, isNotNull, Nullable, ofClass } from "@tutao/tutanota-utils"
|
||||||
import { MailFacade } from "../../../common/api/worker/facades/lazy/MailFacade"
|
import { MailFacade } from "../../../common/api/worker/facades/lazy/MailFacade"
|
||||||
|
|
@ -75,6 +76,7 @@ export class SpamClassificationHandler {
|
||||||
subject: mail.subject,
|
subject: mail.subject,
|
||||||
body: getMailBodyText(mailDetails.body),
|
body: getMailBodyText(mailDetails.body),
|
||||||
ownerGroup: assertNotNull(mail._ownerGroup),
|
ownerGroup: assertNotNull(mail._ownerGroup),
|
||||||
|
...extractSpamHeaderFeatures(mail, mailDetails),
|
||||||
}
|
}
|
||||||
const isSpam = (await this.spamClassifier?.predict(spamPredMailDatum)) ?? null
|
const isSpam = (await this.spamClassifier?.predict(spamPredMailDatum)) ?? null
|
||||||
|
|
||||||
|
|
@ -141,7 +143,44 @@ export class SpamClassificationHandler {
|
||||||
isSpam: DEFAULT_IS_SPAM,
|
isSpam: DEFAULT_IS_SPAM,
|
||||||
isSpamConfidence: DEFAULT_IS_SPAM_CONFIDENCE,
|
isSpamConfidence: DEFAULT_IS_SPAM_CONFIDENCE,
|
||||||
ownerGroup: assertNotNull(mail._ownerGroup),
|
ownerGroup: assertNotNull(mail._ownerGroup),
|
||||||
|
...extractSpamHeaderFeatures(mail, mailDetails),
|
||||||
}
|
}
|
||||||
await this.spamClassifier?.storeSpamClassification(spamTrainMailDatum)
|
await this.spamClassifier?.storeSpamClassification(spamTrainMailDatum)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Collects the header-derived spam features of a mail as plain strings.
 *
 * @param mail supplies the sender and the authentication status
 * @param mailDetails supplies the to/cc/bcc recipients
 * @returns an object with `sender`, `toRecipients`, `ccRecipients`, `bccRecipients` and `authStatus`
 */
export function extractSpamHeaderFeatures(mail: Mail, mailDetails: MailDetails) {
	const senderText = joinNamesAndMailAddresses([mail?.sender])
	const recipientTexts = extractRecipients(mailDetails)
	const authToken = convertAuthStatusToSpamCategorizationToken(mail.authStatus)

	return {
		sender: senderText,
		toRecipients: recipientTexts.toRecipients,
		ccRecipients: recipientTexts.ccRecipients,
		bccRecipients: recipientTexts.bccRecipients,
		authStatus: authToken,
	}
}
|
||||||
|
|
||||||
|
/**
 * Turns the to/cc/bcc recipient lists of the given mail details into one joined string per list.
 *
 * @returns an object with `toRecipients`, `ccRecipients` and `bccRecipients`
 */
function extractRecipients({ recipients }: MailDetails) {
	return {
		toRecipients: joinNamesAndMailAddresses(recipients?.toRecipients),
		ccRecipients: joinNamesAndMailAddresses(recipients?.ccRecipients),
		bccRecipients: joinNamesAndMailAddresses(recipients?.bccRecipients),
	}
}
|
||||||
|
|
||||||
|
/**
 * Flattens a list of mail addresses into a single space-separated string of `name address` pairs.
 *
 * Missing input (null/undefined list) and an empty list yield an empty string. Nullish entries are
 * skipped, and a missing name or address contributes an empty string, so the literal text
 * "undefined" never ends up in the result.
 *
 * @param recipients the addresses to join; may be null or undefined
 * @returns the joined `name address` pairs, or "" when there is nothing to join
 */
function joinNamesAndMailAddresses(recipients: ReadonlyArray<MailAddress | null | undefined> | null | undefined): string {
	if (recipients == null) {
		return ""
	}

	return (
		recipients
			// callers may pass optional values (e.g. a single, possibly missing sender)
			.filter((recipient) => recipient != null)
			.map((recipient) => `${recipient?.name ?? ""} ${recipient?.address ?? ""}`)
			.join(" ")
	)
}
|
||||||
|
|
||||||
|
/**
 * Maps a mail authentication status to the token used for it in the spam classification input.
 *
 * @param authStatus one of the `MailAuthenticationStatus` values, or null
 * @returns the matching token, or "" for null and for any status without a dedicated token
 */
function convertAuthStatusToSpamCategorizationToken(authStatus: string | null): string {
	switch (authStatus) {
		case MailAuthenticationStatus.AUTHENTICATED:
			return "TAUTHENTICATED"
		case MailAuthenticationStatus.HARD_FAIL:
			return "THARDFAIL"
		case MailAuthenticationStatus.SOFT_FAIL:
			return "TSOFTFAIL"
		case MailAuthenticationStatus.INVALID_MAIL_FROM:
			return "TINVALIDMAILFROM"
		case MailAuthenticationStatus.MISSING_MAIL_FROM:
			return "TMISSINGMAILFROM"
		default:
			return ""
	}
}
|
||||||
|
|
|
||||||
|
|
@ -67,12 +67,14 @@ export const SearchTableDefinitions: Record<string, OfflineStorageTable> = Objec
|
||||||
})
|
})
|
||||||
|
|
||||||
export const SpamClassificationDefinitions: Record<string, OfflineStorageTable> = Object.freeze({
|
export const SpamClassificationDefinitions: Record<string, OfflineStorageTable> = Object.freeze({
|
||||||
// Spam classification training data
|
|
||||||
spam_classification_training_data: {
|
spam_classification_training_data: {
|
||||||
definition:
|
definition:
|
||||||
"CREATE TABLE IF NOT EXISTS spam_classification_training_data (listId TEXT NOT NULL, elementId TEXT NOT NULL," +
|
"CREATE TABLE IF NOT EXISTS spam_classification_training_data (listId TEXT NOT NULL, elementId TEXT NOT NULL," +
|
||||||
" ownerGroup TEXT NOT NULL, subject TEXT NOT NULL, body TEXT NOT NULL, isSpam NUMBER, " +
|
"ownerGroup TEXT NOT NULL, subject TEXT NOT NULL, body TEXT NOT NULL, isSpam NUMBER," +
|
||||||
"lastModified NUMBER NOT NULL, isSpamConfidence NUMBER NOT NULL, PRIMARY KEY (listId, elementId))",
|
"lastModified NUMBER NOT NULL, isSpamConfidence NUMBER NOT NULL, sender TEXT NOT NULL," +
|
||||||
|
"toRecipients TEXT NOT NULL, ccRecipients TEXT NOT NULL, bccRecipients TEXT NOT NULL," +
|
||||||
|
"authStatus TEXT NOT NULL, PRIMARY KEY (listId, elementId))",
|
||||||
|
|
||||||
purgedWithCache: true,
|
purgedWithCache: true,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
@ -187,18 +189,24 @@ export class OfflineStoragePersistence {
|
||||||
|
|
||||||
async storeSpamClassification(spamTrainMailDatum: SpamTrainMailDatum): Promise<void> {
|
async storeSpamClassification(spamTrainMailDatum: SpamTrainMailDatum): Promise<void> {
|
||||||
const { query, params } = sql`
|
const { query, params } = sql`
|
||||||
INSERT
|
INSERT
|
||||||
OR REPLACE INTO spam_classification_training_data(listId, elementId, ownerGroup, subject, body, isSpam, lastModified, isSpamConfidence)
|
OR REPLACE INTO spam_classification_training_data(listId, elementId, ownerGroup, subject, body, isSpam,
|
||||||
|
lastModified, isSpamConfidence, sender, toRecipients, ccRecipients, bccRecipients, authStatus)
|
||||||
VALUES (
|
VALUES (
|
||||||
${listIdPart(spamTrainMailDatum.mailId)},
|
${listIdPart(spamTrainMailDatum.mailId)},
|
||||||
${elementIdPart(spamTrainMailDatum.mailId)},
|
${elementIdPart(spamTrainMailDatum.mailId)},
|
||||||
${spamTrainMailDatum.ownerGroup},
|
${spamTrainMailDatum.ownerGroup},
|
||||||
${spamTrainMailDatum.subject},
|
${spamTrainMailDatum.subject},
|
||||||
${spamTrainMailDatum.body},
|
${spamTrainMailDatum.body},
|
||||||
${spamTrainMailDatum.isSpam ? 1 : 0},
|
${spamTrainMailDatum.isSpam ? 1 : 0},
|
||||||
${Date.now()},
|
${Date.now()},
|
||||||
${spamTrainMailDatum.isSpamConfidence}
|
${spamTrainMailDatum.isSpamConfidence},
|
||||||
)`
|
${spamTrainMailDatum.sender},
|
||||||
|
${spamTrainMailDatum.toRecipients},
|
||||||
|
${spamTrainMailDatum.ccRecipients},
|
||||||
|
${spamTrainMailDatum.bccRecipients},
|
||||||
|
${spamTrainMailDatum.authStatus}
|
||||||
|
)`
|
||||||
await this.sqlCipherFacade.run(query, params)
|
await this.sqlCipherFacade.run(query, params)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -250,11 +258,21 @@ export class OfflineStoragePersistence {
|
||||||
}
|
}
|
||||||
|
|
||||||
async getCertainSpamClassificationTrainingDataAfterCutoff(cutoffTimestamp: number, ownerGroupId: Id): Promise<SpamTrainMailDatum[]> {
|
async getCertainSpamClassificationTrainingDataAfterCutoff(cutoffTimestamp: number, ownerGroupId: Id): Promise<SpamTrainMailDatum[]> {
|
||||||
const { query, params } = sql`SELECT listId, elementId, subject, body, isSpam, isSpamConfidence
|
const { query, params } = sql`SELECT listId,
|
||||||
FROM spam_classification_training_data
|
elementId,
|
||||||
WHERE lastModified > ${cutoffTimestamp}
|
subject,
|
||||||
AND isSpamConfidence > 0
|
body,
|
||||||
AND ownerGroup = ${ownerGroupId}`
|
isSpam,
|
||||||
|
isSpamConfidence,
|
||||||
|
sender,
|
||||||
|
toRecipients,
|
||||||
|
ccRecipients,
|
||||||
|
bccRecipients,
|
||||||
|
authStatus
|
||||||
|
FROM spam_classification_training_data
|
||||||
|
WHERE lastModified > ${cutoffTimestamp}
|
||||||
|
AND isSpamConfidence > 0
|
||||||
|
AND ownerGroup = ${ownerGroupId}`
|
||||||
const resultRows = await this.sqlCipherFacade.all(query, params)
|
const resultRows = await this.sqlCipherFacade.all(query, params)
|
||||||
return resultRows.map(untagSqlObject).map((row) => row as unknown as SpamTrainMailDatum)
|
return resultRows.map(untagSqlObject).map((row) => row as unknown as SpamTrainMailDatum)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -9,30 +9,30 @@ export const ML_DATE_REGEX = [
|
||||||
/\b(?<!-)\d{4}(?:-\d{1,2}){2}(?!-)\b/g, // 2023-12-01 | 2023-12-1
|
/\b(?<!-)\d{4}(?:-\d{1,2}){2}(?!-)\b/g, // 2023-12-01 | 2023-12-1
|
||||||
]
|
]
|
||||||
|
|
||||||
export const ML_DATE_TOKEN = " <DATE> "
|
export const ML_DATE_TOKEN = " TDATE "
|
||||||
|
|
||||||
export const ML_URL_REGEX = /(?:http|https|ftp|sftp):\/\/([\w.-]+)(?:\/[^\s]*)?/g
|
export const ML_URL_REGEX = /(?:http|https|ftp|sftp):\/\/([\w.-]+)(?:\/[^\s]*)?/g
|
||||||
|
|
||||||
export const ML_URL_TOKEN = " <URL-$1> "
|
export const ML_URL_TOKEN = " TURL$1 "
|
||||||
|
|
||||||
export const ML_EMAIL_ADDR_REGEX = /(?:mailto:)?[A-Za-z0-9_+\-.]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g
|
export const ML_EMAIL_ADDR_REGEX = /(?:mailto:)?[A-Za-z0-9_+\-.]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g
|
||||||
export const ML_EMAIL_ADDR_TOKEN = " <EMAIL> "
|
export const ML_EMAIL_ADDR_TOKEN = " TEMAIL "
|
||||||
|
|
||||||
export const ML_BITCOIN_REGEX = /\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b/g
|
export const ML_BITCOIN_REGEX = /\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b/g
|
||||||
|
|
||||||
export const ML_BITCOIN_TOKEN = " <BITCOIN> "
|
export const ML_BITCOIN_TOKEN = " TBITCOIN "
|
||||||
|
|
||||||
export const ML_CREDIT_CARD_REGEX = /\b(\d{4}\s?){4}\b|\b[0-9]\d{13,16}\b/g
|
export const ML_CREDIT_CARD_REGEX = /\b(\d{4}\s?){4}\b|\b[0-9]\d{13,16}\b/g
|
||||||
|
|
||||||
export const ML_CREDIT_CARD_TOKEN = " <CREDIT-CARD> "
|
export const ML_CREDIT_CARD_TOKEN = " TCREDITCARD "
|
||||||
|
|
||||||
export const ML_NUMBER_SEQUENCE_REGEX = /\b\d+\b/g
|
export const ML_NUMBER_SEQUENCE_REGEX = /\b\d+\b/g
|
||||||
|
|
||||||
export const ML_NUMBER_SEQUENCE_TOKEN = " <NUMBER> "
|
export const ML_NUMBER_SEQUENCE_TOKEN = " TNUMBER "
|
||||||
|
|
||||||
export const ML_SPECIAL_CHARACTER_REGEX = /([!@#$%^&*()+`_=\\{}"':;?/,.~]+)(?![^<]*>)|(?!\w)[-]+(?!\w)/g
|
// Matches runs of one or more special characters, including brackets, angle brackets and hyphens.
// The hyphen is escaped so that it is a literal "-" and not a range between "," and ".".
export const ML_SPECIAL_CHARACTER_REGEX = /([!@#$%^&*()[\]<>+`_=\\{}"':;?/,\-.~]+)/g
|
||||||
|
|
||||||
export const ML_SPECIAL_CHARACTER_TOKEN = " <SPECIAL-CHAR> "
|
export const ML_SPECIAL_CHARACTER_TOKEN = " TSPECIALCHAR "
|
||||||
|
|
||||||
export const ML_SPACE_BEFORE_NEW_LINE_REGEX = /\s+\n/g
|
export const ML_SPACE_BEFORE_NEW_LINE_REGEX = /\s+\n/g
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,16 @@
|
||||||
import { EntityClient } from "../../../common/api/common/EntityClient"
|
import { EntityClient } from "../../../common/api/common/EntityClient"
|
||||||
import { assertNotNull, isNotNull, lazyAsync } from "@tutao/tutanota-utils"
|
import { assertNotNull, isNotNull, lazyAsync } from "@tutao/tutanota-utils"
|
||||||
import { MailBag, MailboxGroupRootTypeRef, MailBoxTypeRef, MailFolder, MailFolderTypeRef, MailTypeRef } from "../../../common/api/entities/tutanota/TypeRefs"
|
import {
|
||||||
|
MailAddress,
|
||||||
|
MailBag,
|
||||||
|
MailboxGroupRootTypeRef,
|
||||||
|
MailBoxTypeRef,
|
||||||
|
MailDetails,
|
||||||
|
MailFolder,
|
||||||
|
MailFolderTypeRef,
|
||||||
|
MailTypeRef,
|
||||||
|
Recipients,
|
||||||
|
} from "../../../common/api/entities/tutanota/TypeRefs"
|
||||||
import { getMailSetKind, getSpamConfidence, MailSetKind } from "../../../common/api/common/TutanotaConstants"
|
import { getMailSetKind, getSpamConfidence, MailSetKind } from "../../../common/api/common/TutanotaConstants"
|
||||||
import { elementIdPart, isSameId, listIdPart, timestampToGeneratedId } from "../../../common/api/common/utils/EntityUtils"
|
import { elementIdPart, isSameId, listIdPart, timestampToGeneratedId } from "../../../common/api/common/utils/EntityUtils"
|
||||||
import { OfflineStoragePersistence } from "../index/OfflineStoragePersistence"
|
import { OfflineStoragePersistence } from "../index/OfflineStoragePersistence"
|
||||||
|
|
@ -8,6 +18,7 @@ import { getMailBodyText } from "../../../common/api/common/CommonMailUtils"
|
||||||
import { BulkMailLoader, MailWithMailDetails } from "../index/BulkMailLoader"
|
import { BulkMailLoader, MailWithMailDetails } from "../index/BulkMailLoader"
|
||||||
import { hasError } from "../../../common/api/common/utils/ErrorUtils"
|
import { hasError } from "../../../common/api/common/utils/ErrorUtils"
|
||||||
import { SpamTrainMailDatum } from "./SpamClassifier"
|
import { SpamTrainMailDatum } from "./SpamClassifier"
|
||||||
|
import { extractSpamHeaderFeatures } from "../../mail/model/SpamClassificationHandler"
|
||||||
|
|
||||||
const INITIAL_SPAM_CLASSIFICATION_INDEX_INTERVAL_DAYS = 28
|
const INITIAL_SPAM_CLASSIFICATION_INDEX_INTERVAL_DAYS = 28
|
||||||
|
|
||||||
|
|
@ -30,7 +41,6 @@ export class SpamClassificationInitializer {
|
||||||
// available in the current mail bag
|
// available in the current mail bag
|
||||||
const data = await this.downloadMailAndMailDetailsByGroupMembership(ownerGroup)
|
const data = await this.downloadMailAndMailDetailsByGroupMembership(ownerGroup)
|
||||||
data.filter((datum) => datum.isSpamConfidence > 0)
|
data.filter((datum) => datum.isSpamConfidence > 0)
|
||||||
data.map((datum) => this.offlineStorage.storeSpamClassification(datum))
|
|
||||||
|
|
||||||
let spamMailsCount = 0
|
let spamMailsCount = 0
|
||||||
let hamMailsCount = 0
|
let hamMailsCount = 0
|
||||||
|
|
@ -53,7 +63,6 @@ export class SpamClassificationInitializer {
|
||||||
const mailbox = await this.entityClient.load(MailBoxTypeRef, mailboxGroupRoot.mailbox)
|
const mailbox = await this.entityClient.load(MailBoxTypeRef, mailboxGroupRoot.mailbox)
|
||||||
const mailSets = await this.entityClient.loadAll(MailFolderTypeRef, assertNotNull(mailbox.folders).folders)
|
const mailSets = await this.entityClient.loadAll(MailFolderTypeRef, assertNotNull(mailbox.folders).folders)
|
||||||
const spamFolder = mailSets.find((s) => getMailSetKind(s) === MailSetKind.SPAM)!
|
const spamFolder = mailSets.find((s) => getMailSetKind(s) === MailSetKind.SPAM)!
|
||||||
const inboxFolder = mailSets.find((s) => getMailSetKind(s) === MailSetKind.INBOX)!
|
|
||||||
|
|
||||||
const downloadedMailClassificationDatas = new Array<SpamTrainMailDatum>()
|
const downloadedMailClassificationDatas = new Array<SpamTrainMailDatum>()
|
||||||
const allMailbags = [assertNotNull(mailbox.currentMailBag), ...mailbox.archivedMailBags].reverse() // sorted from latest to oldest
|
const allMailbags = [assertNotNull(mailbox.currentMailBag), ...mailbox.archivedMailBags].reverse() // sorted from latest to oldest
|
||||||
|
|
@ -63,14 +72,14 @@ export class SpamClassificationInitializer {
|
||||||
isNotNull(currentMailbag) && downloadedMailClassificationDatas.length < this.MIN_MAILS_COUNT;
|
isNotNull(currentMailbag) && downloadedMailClassificationDatas.length < this.MIN_MAILS_COUNT;
|
||||||
currentMailbag = allMailbags.pop()
|
currentMailbag = allMailbags.pop()
|
||||||
) {
|
) {
|
||||||
const mailsOfThisMailbag = await this.downloadMailAndMailDetailsByMailbag(currentMailbag, spamFolder, inboxFolder)
|
const mailsOfThisMailbag = await this.downloadMailAndMailDetailsByMailbag(currentMailbag, spamFolder)
|
||||||
downloadedMailClassificationDatas.push(...mailsOfThisMailbag)
|
downloadedMailClassificationDatas.push(...mailsOfThisMailbag)
|
||||||
}
|
}
|
||||||
|
|
||||||
return downloadedMailClassificationDatas
|
return downloadedMailClassificationDatas
|
||||||
}
|
}
|
||||||
|
|
||||||
private async downloadMailAndMailDetailsByMailbag(mailbag: MailBag, spamFolder: MailFolder, inboxFolder: MailFolder): Promise<Array<SpamTrainMailDatum>> {
|
private async downloadMailAndMailDetailsByMailbag(mailbag: MailBag, spamFolder: MailFolder): Promise<Array<SpamTrainMailDatum>> {
|
||||||
const { LocalTimeDateProvider } = await import("../../../common/api/worker/DateProvider.js")
|
const { LocalTimeDateProvider } = await import("../../../common/api/worker/DateProvider.js")
|
||||||
const dateProvider = new LocalTimeDateProvider()
|
const dateProvider = new LocalTimeDateProvider()
|
||||||
const startTime = dateProvider.getStartOfDayShiftedBy(this.TIME_LIMIT).getTime()
|
const startTime = dateProvider.getStartOfDayShiftedBy(this.TIME_LIMIT).getTime()
|
||||||
|
|
@ -84,11 +93,12 @@ export class SpamClassificationInitializer {
|
||||||
// Download mail details
|
// Download mail details
|
||||||
.then((mails) => bulkMailLoader.loadMailDetails(mails))
|
.then((mails) => bulkMailLoader.loadMailDetails(mails))
|
||||||
// Map to spam mail datum
|
// Map to spam mail datum
|
||||||
.then((mails) => mails.map((m) => this.mailWithDetailsToMailDatum(spamFolder, inboxFolder, m)))
|
.then((mails) => mails.map((m) => this.mailWithDetailsToMailDatum(spamFolder, m)))
|
||||||
}
|
}
|
||||||
|
|
||||||
private mailWithDetailsToMailDatum(spamFolder: MailFolder, inboxFolder: MailFolder, { mail, mailDetails }: MailWithMailDetails): SpamTrainMailDatum {
|
private mailWithDetailsToMailDatum(spamFolder: MailFolder, { mail, mailDetails }: MailWithMailDetails): SpamTrainMailDatum {
|
||||||
const isSpam = mail.sets.some((folderId) => isSameId(folderId, spamFolder._id))
|
const isSpam = mail.sets.some((folderId) => isSameId(folderId, spamFolder._id))
|
||||||
|
|
||||||
return {
|
return {
|
||||||
mailId: mail._id,
|
mailId: mail._id,
|
||||||
subject: mail.subject,
|
subject: mail.subject,
|
||||||
|
|
@ -98,6 +108,7 @@ export class SpamClassificationInitializer {
|
||||||
listId: listIdPart(mail._id),
|
listId: listIdPart(mail._id),
|
||||||
elementId: elementIdPart(mail._id),
|
elementId: elementIdPart(mail._id),
|
||||||
ownerGroup: assertNotNull(mail._ownerGroup),
|
ownerGroup: assertNotNull(mail._ownerGroup),
|
||||||
|
...extractSpamHeaderFeatures(mail, mailDetails),
|
||||||
} as SpamTrainMailDatum
|
} as SpamTrainMailDatum
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -54,12 +54,22 @@ export type SpamTrainMailDatum = {
|
||||||
isSpam: boolean
|
isSpam: boolean
|
||||||
isSpamConfidence: number
|
isSpamConfidence: number
|
||||||
ownerGroup: Id
|
ownerGroup: Id
|
||||||
|
sender: string
|
||||||
|
toRecipients: string
|
||||||
|
ccRecipients: string
|
||||||
|
bccRecipients: string
|
||||||
|
authStatus: string
|
||||||
}
|
}
|
||||||
|
|
||||||
export type SpamPredMailDatum = {
|
export type SpamPredMailDatum = {
|
||||||
subject: string
|
subject: string
|
||||||
body: string
|
body: string
|
||||||
ownerGroup: Id
|
ownerGroup: Id
|
||||||
|
sender: string
|
||||||
|
toRecipients: string
|
||||||
|
ccRecipients: string
|
||||||
|
bccRecipients: string
|
||||||
|
authStatus: string
|
||||||
}
|
}
|
||||||
|
|
||||||
const PREDICTION_THRESHOLD = 0.55
|
const PREDICTION_THRESHOLD = 0.55
|
||||||
|
|
@ -223,9 +233,16 @@ export class SpamClassifier {
|
||||||
preprocessedMail = preprocessedMail.replaceAll(ML_SPACE_BEFORE_NEW_LINE_REGEX, ML_SPACE_BEFORE_NEW_LINE_TOKEN)
|
preprocessedMail = preprocessedMail.replaceAll(ML_SPACE_BEFORE_NEW_LINE_REGEX, ML_SPACE_BEFORE_NEW_LINE_TOKEN)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
preprocessedMail += this.getHeaderFeatures(mail)
|
||||||
|
|
||||||
return preprocessedMail
|
return preprocessedMail
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds the header part of the classifier input for a mail: sender, to/cc/bcc recipients and
 * auth status, each preceded by a newline, in that order.
 */
private getHeaderFeatures(mail: SpamTrainMailDatum | SpamPredMailDatum): string {
	const headerFields = [mail.sender, mail.toRecipients, mail.ccRecipients, mail.bccRecipients, mail.authStatus]
	return headerFields.map((field) => `\n${field}`).join("")
}
|
||||||
|
|
||||||
public async initialTraining(mails: SpamTrainMailDatum[]): Promise<TrainingPerformance> {
|
public async initialTraining(mails: SpamTrainMailDatum[]): Promise<TrainingPerformance> {
|
||||||
const preprocessingStart = performance.now()
|
const preprocessingStart = performance.now()
|
||||||
const tokenizedMails = await promiseMap(mails, (mail) => spamClassifierTokenizer(this.preprocessMail(mail)))
|
const tokenizedMails = await promiseMap(mails, (mail) => spamClassifierTokenizer(this.preprocessMail(mail)))
|
||||||
|
|
@ -497,7 +514,8 @@ export class SpamClassifier {
|
||||||
private concatSubjectAndBody(mail: SpamTrainMailDatum | SpamPredMailDatum) {
|
private concatSubjectAndBody(mail: SpamTrainMailDatum | SpamPredMailDatum) {
|
||||||
const subject = mail.subject || ""
|
const subject = mail.subject || ""
|
||||||
const body = mail.body || ""
|
const body = mail.body || ""
|
||||||
const concatenated = `${subject} ${body}`.trim()
|
const concatenated = `${subject}\n${body}`.trim()
|
||||||
|
|
||||||
return concatenated.length > 0 ? concatenated : " "
|
return concatenated.length > 0 ? concatenated : " "
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -130,11 +130,11 @@ o.spec("PreprocessPatterns", () => {
|
||||||
o.spec("Url patterns", () => {
|
o.spec("Url patterns", () => {
|
||||||
o.test("All recognized url patterns", async () => {
|
o.test("All recognized url patterns", async () => {
|
||||||
const urlsMap = new Map([
|
const urlsMap = new Map([
|
||||||
["https://tuta.com", "<URL-tuta.com>"],
|
["https://tuta.com", "TURLtuta.com"],
|
||||||
["https://microsoft.com/outlook/test", "<URL-microsoft.com>"],
|
["https://microsoft.com/outlook/test", "TURLmicrosoft.com"],
|
||||||
["https://subdomain.microsoft.com/outlook/test", "<URL-subdomain.microsoft.com>"],
|
["https://subdomain.microsoft.com/outlook/test", "TURLsubdomain.microsoft.com"],
|
||||||
["https://subdomain.spam.com/this/is/not/cool/dsfalkfjd2309jlk234oi2k", "<URL-subdomain.spam.com>"],
|
["https://subdomain.spam.com/this/is/not/cool/dsfalkfjd2309jlk234oi2k", "TURLsubdomain.spam.com"],
|
||||||
["https://subdomain.test.de/spam!", "<URL-subdomain.test.de>"],
|
["https://subdomain.test.de/spam!", "TURLsubdomain.test.de"],
|
||||||
])
|
])
|
||||||
|
|
||||||
for (const [domain, expectedToken] of urlsMap.entries()) {
|
for (const [domain, expectedToken] of urlsMap.entries()) {
|
||||||
|
|
@ -297,6 +297,8 @@ o.spec("PreprocessPatterns", () => {
|
||||||
["*", ML_SPECIAL_CHARACTER_TOKEN],
|
["*", ML_SPECIAL_CHARACTER_TOKEN],
|
||||||
["(", ML_SPECIAL_CHARACTER_TOKEN],
|
["(", ML_SPECIAL_CHARACTER_TOKEN],
|
||||||
[")", ML_SPECIAL_CHARACTER_TOKEN],
|
[")", ML_SPECIAL_CHARACTER_TOKEN],
|
||||||
|
["<", ML_SPECIAL_CHARACTER_TOKEN],
|
||||||
|
[">", ML_SPECIAL_CHARACTER_TOKEN],
|
||||||
["+", ML_SPECIAL_CHARACTER_TOKEN],
|
["+", ML_SPECIAL_CHARACTER_TOKEN],
|
||||||
["`", ML_SPECIAL_CHARACTER_TOKEN],
|
["`", ML_SPECIAL_CHARACTER_TOKEN],
|
||||||
["_", ML_SPECIAL_CHARACTER_TOKEN],
|
["_", ML_SPECIAL_CHARACTER_TOKEN],
|
||||||
|
|
@ -318,6 +320,7 @@ o.spec("PreprocessPatterns", () => {
|
||||||
["--", ML_SPECIAL_CHARACTER_TOKEN],
|
["--", ML_SPECIAL_CHARACTER_TOKEN],
|
||||||
["---", ML_SPECIAL_CHARACTER_TOKEN],
|
["---", ML_SPECIAL_CHARACTER_TOKEN],
|
||||||
["--- ---", `${ML_SPECIAL_CHARACTER_TOKEN} ${ML_SPECIAL_CHARACTER_TOKEN}`],
|
["--- ---", `${ML_SPECIAL_CHARACTER_TOKEN} ${ML_SPECIAL_CHARACTER_TOKEN}`],
|
||||||
|
["[ ]", `${ML_SPECIAL_CHARACTER_TOKEN} ${ML_SPECIAL_CHARACTER_TOKEN}`],
|
||||||
])
|
])
|
||||||
|
|
||||||
for (const [specialCharSequence, expectedResult] of specialCharsMap) {
|
for (const [specialCharSequence, expectedResult] of specialCharsMap) {
|
||||||
|
|
@ -327,7 +330,7 @@ o.spec("PreprocessPatterns", () => {
|
||||||
})
|
})
|
||||||
|
|
||||||
o.test("Not recognized special-character-like patterns", async () => {
|
o.test("Not recognized special-character-like patterns", async () => {
|
||||||
const notSpecialChars = ["[", "]", "<", ">", "test-test"]
|
const notSpecialChars = ["§", "€"]
|
||||||
|
|
||||||
const notSpecialCharsText = notSpecialChars.join("\n")
|
const notSpecialCharsText = notSpecialChars.join("\n")
|
||||||
let resultNotSpecialCharsText = notSpecialCharsText
|
let resultNotSpecialCharsText = notSpecialCharsText
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ import { parseCsv } from "../../../../../../src/common/misc/parsing/CsvParser"
|
||||||
import {
|
import {
|
||||||
DEFAULT_PREPROCESS_CONFIGURATION,
|
DEFAULT_PREPROCESS_CONFIGURATION,
|
||||||
SpamClassifier,
|
SpamClassifier,
|
||||||
spamClassifierTokenizer as testTokenize,
|
|
||||||
SpamTrainMailDatum,
|
SpamTrainMailDatum,
|
||||||
} from "../../../../../../src/mail-app/workerUtils/spamClassification/SpamClassifier"
|
} from "../../../../../../src/mail-app/workerUtils/spamClassification/SpamClassifier"
|
||||||
import { OfflineStoragePersistence } from "../../../../../../src/mail-app/workerUtils/index/OfflineStoragePersistence"
|
import { OfflineStoragePersistence } from "../../../../../../src/mail-app/workerUtils/index/OfflineStoragePersistence"
|
||||||
|
|
@ -36,6 +35,11 @@ export async function readMailDataFromCSV(filePath: string): Promise<{
|
||||||
const subject = row[8]
|
const subject = row[8]
|
||||||
const body = row[10]
|
const body = row[10]
|
||||||
const label = row[11]
|
const label = row[11]
|
||||||
|
const from = row[0]
|
||||||
|
const to = row[1]
|
||||||
|
const cc = row[2]
|
||||||
|
const bcc = row[3]
|
||||||
|
const authStatus = row[4]
|
||||||
|
|
||||||
let isSpam = label === "spam" ? true : label === "ham" ? false : null
|
let isSpam = label === "spam" ? true : label === "ham" ? false : null
|
||||||
isSpam = assertNotNull(isSpam, "Unknown label detected: " + label)
|
isSpam = assertNotNull(isSpam, "Unknown label detected: " + label)
|
||||||
|
|
@ -47,6 +51,11 @@ export async function readMailDataFromCSV(filePath: string): Promise<{
|
||||||
isSpam,
|
isSpam,
|
||||||
isSpamConfidence: 1,
|
isSpamConfidence: 1,
|
||||||
ownerGroup: "owner",
|
ownerGroup: "owner",
|
||||||
|
sender: from,
|
||||||
|
toRecipients: to,
|
||||||
|
ccRecipients: cc,
|
||||||
|
bccRecipients: bcc,
|
||||||
|
authStatus: authStatus,
|
||||||
} as SpamTrainMailDatum)
|
} as SpamTrainMailDatum)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -99,6 +108,11 @@ o.spec("SpamClassifierTest", () => {
|
||||||
isSpam: true,
|
isSpam: true,
|
||||||
isSpamConfidence: 1,
|
isSpamConfidence: 1,
|
||||||
ownerGroup: "owner",
|
ownerGroup: "owner",
|
||||||
|
sender: "",
|
||||||
|
toRecipients: "",
|
||||||
|
ccRecipients: "",
|
||||||
|
bccRecipients: "",
|
||||||
|
authStatus: "",
|
||||||
}
|
}
|
||||||
const layersModel = object<Sequential>()
|
const layersModel = object<Sequential>()
|
||||||
spamClassifier.addSpamClassifierForOwner(spamTrainMailDatum.ownerGroup, layersModel, false)
|
spamClassifier.addSpamClassifierForOwner(spamTrainMailDatum.ownerGroup, layersModel, false)
|
||||||
|
|
@ -119,6 +133,11 @@ o.spec("SpamClassifierTest", () => {
|
||||||
isSpam: false,
|
isSpam: false,
|
||||||
isSpamConfidence: 0,
|
isSpamConfidence: 0,
|
||||||
ownerGroup: "owner",
|
ownerGroup: "owner",
|
||||||
|
sender: "",
|
||||||
|
toRecipients: "",
|
||||||
|
ccRecipients: "",
|
||||||
|
bccRecipients: "",
|
||||||
|
authStatus: "",
|
||||||
}
|
}
|
||||||
|
|
||||||
const layersModel = object<Sequential>()
|
const layersModel = object<Sequential>()
|
||||||
|
|
@ -165,6 +184,11 @@ o.spec("SpamClassifierTest", () => {
|
||||||
const classifier = new SpamClassifier(object(), object(), object())
|
const classifier = new SpamClassifier(object(), object(), object())
|
||||||
const mail = {
|
const mail = {
|
||||||
subject: `Sample Tokens and values`,
|
subject: `Sample Tokens and values`,
|
||||||
|
sender: "sender",
|
||||||
|
toRecipients: "toRecipients",
|
||||||
|
ccRecipients: "ccRecipients",
|
||||||
|
bccRecipients: "bccRecipients",
|
||||||
|
authStatus: "authStatus",
|
||||||
// prettier-ignore
|
// prettier-ignore
|
||||||
body: `Hello, these are my MAC Address
|
body: `Hello, these are my MAC Address
|
||||||
FB-94-77-45-96-74
|
FB-94-77-45-96-74
|
||||||
|
|
@ -228,8 +252,8 @@ o.spec("SpamClassifierTest", () => {
|
||||||
Special Characters
|
Special Characters
|
||||||
!
|
!
|
||||||
@
|
@
|
||||||
Not Special Characters
|
Not Special Character
|
||||||
]
|
§
|
||||||
Number Sequences:
|
Number Sequences:
|
||||||
26098375
|
26098375
|
||||||
IBAN: DE91 1002 0370 0320 2239 82
|
IBAN: DE91 1002 0370 0320 2239 82
|
||||||
|
|
@ -252,84 +276,90 @@ this text is shown
|
||||||
} as SpamTrainMailDatum
|
} as SpamTrainMailDatum
|
||||||
const preprocessedMail = classifier.preprocessMail(mail)
|
const preprocessedMail = classifier.preprocessMail(mail)
|
||||||
// prettier-ignore
|
// prettier-ignore
|
||||||
const expectedOutput = `Sample Tokens and values Hello <SPECIAL-CHAR> these are my MAC Address
|
const expectedOutput = `Sample Tokens and values
|
||||||
\t\t\t\tFB <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
Hello TSPECIALCHAR these are my MAC Address
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> -D5 <SPECIAL-CHAR> <NUMBER> -7C
|
\t\t\t\tFB TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\tB4 <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> -2A-DE-D4
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR D5 TSPECIALCHAR TNUMBER TSPECIALCHAR 7C
|
||||||
|
\t\t\t\tB4 TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR 2A TSPECIALCHAR DE TSPECIALCHAR D4
|
||||||
\t\t\t\talong with my ISBNs
|
\t\t\t\talong with my ISBNs
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\t <NUMBER> -X
|
\t\t\t\t TNUMBER TSPECIALCHAR X
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\tSSN
|
\t\t\t\tSSN
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\tSHAs
|
\t\t\t\tSHAs
|
||||||
\t\t\t\t585eab9b3a5e4430e08f5096d636d0d475a8c69dae21a61c6f1b26c4bd8dd8c1
|
\t\t\t\t585eab9b3a5e4430e08f5096d636d0d475a8c69dae21a61c6f1b26c4bd8dd8c1
|
||||||
\t\t\t\t7233d153f2e0725d3d212d1f27f30258fafd72b286d07b3b1d94e7e3c35dce67
|
\t\t\t\t7233d153f2e0725d3d212d1f27f30258fafd72b286d07b3b1d94e7e3c35dce67
|
||||||
\t\t\t\t769f65bf44557df44fc5f99c014cbe98894107c9d7be0801f37c55b3776c3990
|
\t\t\t\t769f65bf44557df44fc5f99c014cbe98894107c9d7be0801f37c55b3776c3990
|
||||||
\t\t\t\tPhone Numbers
|
\t\t\t\tPhone Numbers
|
||||||
\t\t\t\t <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\t <SPECIAL-CHAR> <NUMBER> <NUMBER> <NUMBER> <NUMBER>
|
\t\t\t\t TSPECIALCHAR TNUMBER TNUMBER TNUMBER TNUMBER
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\tVIN <SPECIAL-CHAR> Vehicle identification number <SPECIAL-CHAR>
|
\t\t\t\tVIN TSPECIALCHAR Vehicle identification number TSPECIALCHAR
|
||||||
\t\t\t\t3FADP4AJ3BM438397
|
\t\t\t\t3FADP4AJ3BM438397
|
||||||
\t\t\t\tWAULT64B82N564937
|
\t\t\t\tWAULT64B82N564937
|
||||||
\t\t\t\tGUIDs
|
\t\t\t\tGUIDs
|
||||||
\t\t\t\t781a9631 <SPECIAL-CHAR> <NUMBER> -4f9c-bb36-25c3364b754b
|
\t\t\t\t781a9631 TSPECIALCHAR TNUMBER TSPECIALCHAR 4f9c TSPECIALCHAR bb36 TSPECIALCHAR 25c3364b754b
|
||||||
\t\t\t\t325783d4-a64e-453b-85e6-ed4b2cd4c9bf
|
\t\t\t\t325783d4 TSPECIALCHAR a64e TSPECIALCHAR 453b TSPECIALCHAR 85e6 TSPECIALCHAR ed4b2cd4c9bf
|
||||||
\t\t\t\tHex Colors
|
\t\t\t\tHex Colors
|
||||||
\t\t\t\t <SPECIAL-CHAR> 2016c1
|
\t\t\t\t TSPECIALCHAR 2016c1
|
||||||
\t\t\t\t <SPECIAL-CHAR> c090a4
|
\t\t\t\t TSPECIALCHAR c090a4
|
||||||
\t\t\t\t <SPECIAL-CHAR> c855f5
|
\t\t\t\t TSPECIALCHAR c855f5
|
||||||
\t\t\t\t <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\tIPV4
|
\t\t\t\tIPV4
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\tOn Date <SPECIAL-CHAR>
|
\t\t\t\tOn Date TSPECIALCHAR
|
||||||
\t\t\t\t <DATE>
|
\t\t\t\t TDATE
|
||||||
\t\t\t\t <DATE>
|
\t\t\t\t TDATE
|
||||||
\t\t\t\tNot Date
|
\t\t\t\tNot Date
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\tURL
|
\t\t\t\tURL
|
||||||
\t\t\t\t <URL-tuta.com>
|
\t\t\t\t TURLtuta TSPECIALCHAR com
|
||||||
\t\t\t\t <URL-subdomain.microsoft.com>
|
\t\t\t\t TURLsubdomain TSPECIALCHAR microsoft TSPECIALCHAR com
|
||||||
\t\t\t\tNOT URL
|
\t\t\t\tNOT URL
|
||||||
\t\t\t\t <URL-tuta>
|
\t\t\t\t TURLtuta
|
||||||
\t\t\t\tMAIL
|
\t\t\t\tMAIL
|
||||||
\t\t\t\t <EMAIL>
|
\t\t\t\t TEMAIL
|
||||||
\t\t\t\t <EMAIL>
|
\t\t\t\t TEMAIL
|
||||||
\t\t\t\tCredit Card
|
\t\t\t\tCredit Card
|
||||||
\t\t\t\t <CREDIT-CARD>
|
\t\t\t\t TCREDITCARD
|
||||||
\t\t\t\t <CREDIT-CARD>
|
\t\t\t\t TCREDITCARD
|
||||||
\t\t\t\tNot Credit Card
|
\t\t\t\tNot Credit Card
|
||||||
\t\t\t\t <NUMBER> <NUMBER>
|
\t\t\t\t TNUMBER TNUMBER
|
||||||
\t\t\t\tBit Coin Address
|
\t\t\t\tBit Coin Address
|
||||||
\t\t\t\t <BITCOIN>
|
\t\t\t\t TBITCOIN
|
||||||
\t\t\t\t <BITCOIN>
|
\t\t\t\t TBITCOIN
|
||||||
\t\t\t\tNot BTC
|
\t\t\t\tNot BTC
|
||||||
\t\t\t\t5213nYwhhGw2qpNijzfnKcbCG4z3hnrVA
|
\t\t\t\t5213nYwhhGw2qpNijzfnKcbCG4z3hnrVA
|
||||||
\t\t\t\t1OUm2eZK2ETeAo8v95WhZioQDy32YSerkD
|
\t\t\t\t1OUm2eZK2ETeAo8v95WhZioQDy32YSerkD
|
||||||
\t\t\t\tSpecial Characters
|
\t\t\t\tSpecial Characters
|
||||||
\t\t\t\t <SPECIAL-CHAR>
|
\t\t\t\t TSPECIALCHAR
|
||||||
\t\t\t\t <SPECIAL-CHAR>
|
\t\t\t\t TSPECIALCHAR
|
||||||
\t\t\t\tNot Special Characters
|
\t\t\t\tNot Special Character
|
||||||
\t\t\t\t]
|
\t\t\t\t§
|
||||||
\t\t\t\tNumber Sequences <SPECIAL-CHAR>
|
\t\t\t\tNumber Sequences TSPECIALCHAR
|
||||||
\t\t\t\t <NUMBER>
|
\t\t\t\t TNUMBER
|
||||||
\t\t\t\tIBAN <SPECIAL-CHAR> DE91 <CREDIT-CARD> <NUMBER>
|
\t\t\t\tIBAN TSPECIALCHAR DE91 TCREDITCARD TNUMBER
|
||||||
\t\t\t\tNot Number Sequences
|
\t\t\t\tNot Number Sequences
|
||||||
\t\t\t\tSHLT116
|
\t\t\t\tSHLT116
|
||||||
\t\t\t\tgb <SPECIAL-CHAR> 67ca4b
|
\t\t\t\tgb TSPECIALCHAR 67ca4b
|
||||||
\t\t\t\tOther values found in mails
|
\t\t\t\tOther values found in mails
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> € <NUMBER> m <NUMBER> Zi <NUMBER> <SPECIAL-CHAR>
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER € TNUMBER m TNUMBER Zi TNUMBER TSPECIALCHAR
|
||||||
\t\t\t\tFax <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> <NUMBER> <NUMBER> <NUMBER>
|
\t\t\t\tFax TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TNUMBER TNUMBER TNUMBER
|
||||||
\t\t\t\tAugust <NUMBER> <SPECIAL-CHAR> <NUMBER>
|
\t\t\t\tAugust TNUMBER TSPECIALCHAR TNUMBER
|
||||||
\t\t\t\t <NUMBER> <SPECIAL-CHAR> <NUMBER> PM <SPECIAL-CHAR> <NUMBER> <SPECIAL-CHAR> <NUMBER> PM
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER PM TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER PM
|
||||||
\t\t\t\tand all text on other lines it seems <SPECIAL-CHAR>
|
\t\t\t\tand all text on other lines it seems TSPECIALCHAR
|
||||||
Button Text
|
Button Text
|
||||||
this text is shown`
|
this text is shown
|
||||||
|
sender
|
||||||
|
toRecipients
|
||||||
|
ccRecipients
|
||||||
|
bccRecipients
|
||||||
|
authStatus`
|
||||||
o.check(preprocessedMail).equals(expectedOutput)
|
o.check(preprocessedMail).equals(expectedOutput)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
@ -357,8 +387,24 @@ this text is shown`
|
||||||
await spamClassifier.initialize("firstGroup")
|
await spamClassifier.initialize("firstGroup")
|
||||||
await spamClassifier.initialize("secondGroup")
|
await spamClassifier.initialize("secondGroup")
|
||||||
|
|
||||||
const isSpamFirstMail = await spamClassifier.predict({ subject: "", body: "", ownerGroup: "firstGroup" })
|
const commonSpamFields = {
|
||||||
const isSpamSecondMail = await spamClassifier.predict({ subject: "", body: "", ownerGroup: "secondGroup" })
|
subject: "",
|
||||||
|
body: "",
|
||||||
|
sender: "string",
|
||||||
|
toRecipients: "string",
|
||||||
|
ccRecipients: "string",
|
||||||
|
bccRecipients: "string",
|
||||||
|
authStatus: "",
|
||||||
|
}
|
||||||
|
|
||||||
|
const isSpamFirstMail = await spamClassifier.predict({
|
||||||
|
ownerGroup: "firstGroup",
|
||||||
|
...commonSpamFields,
|
||||||
|
})
|
||||||
|
const isSpamSecondMail = await spamClassifier.predict({
|
||||||
|
ownerGroup: "secondGroup",
|
||||||
|
...commonSpamFields,
|
||||||
|
})
|
||||||
|
|
||||||
o(isSpamFirstMail).equals(true)
|
o(isSpamFirstMail).equals(true)
|
||||||
o(isSpamSecondMail).equals(false)
|
o(isSpamSecondMail).equals(false)
|
||||||
|
|
@ -434,48 +480,6 @@ if (DO_RUN_PERFORMANCE_ANALYSIS) {
|
||||||
let retrainCount = 0
|
let retrainCount = 0
|
||||||
let predictedSpam = false
|
let predictedSpam = false
|
||||||
while (!predictedSpam && retrainCount++ <= 3) {
|
while (!predictedSpam && retrainCount++ <= 3) {
|
||||||
// await copiedClassifier.updateModel([{ ...sample, isSpam: false }])
|
|
||||||
|
|
||||||
/*
|
|
||||||
isSpamConfidence: 2
|
|
||||||
[
|
|
||||||
3, 2, 1, 3, 1,
|
|
||||||
1, 3, 2, 1, 5
|
|
||||||
] = 22
|
|
||||||
isSpamConfidence: 3
|
|
||||||
[
|
|
||||||
2, 5, 1, 2, 1,
|
|
||||||
1, 1, 2, 1, 2
|
|
||||||
] = 18
|
|
||||||
|
|
||||||
isSpamConfidence: 4
|
|
||||||
[
|
|
||||||
1, 1, 1, 2, 5,
|
|
||||||
1, 1, 1, 1, 5
|
|
||||||
] = 19
|
|
||||||
Retraining finished. Took: 477ms
|
|
||||||
Retraining finished. Took: 1259ms
|
|
||||||
predicted new mail to be with probability 0.46 spam
|
|
||||||
Retraining finished. Took: 560ms
|
|
||||||
Retraining finished. Took: 1273ms
|
|
||||||
|
|
||||||
isSpamConfidence: 8
|
|
||||||
Retraining finished. Took: 486ms
|
|
||||||
Retraining finished. Took: 2289ms
|
|
||||||
predicted new mail to be with probability 0.82 spam
|
|
||||||
Retraining finished. Took: 580ms
|
|
||||||
Retraining finished. Took: 2356ms
|
|
||||||
predicted new mail to be with probability 1.00 spam
|
|
||||||
Retraining finished. Took: 556ms
|
|
||||||
Retraining finished. Took: 2357ms
|
|
||||||
predicted new mail to be with probability 0.52 spam
|
|
||||||
[
|
|
||||||
1, 1, 1, 1, 1,
|
|
||||||
1, 1, 1, 1, 1
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
*/
|
|
||||||
await copiedClassifier.updateModel("owner", [{ ...sample, isSpam: true, isSpamConfidence: 1 }])
|
await copiedClassifier.updateModel("owner", [{ ...sample, isSpam: true, isSpamConfidence: 1 }])
|
||||||
predictedSpam = assertNotNull(await copiedClassifier.predict(sample))
|
predictedSpam = assertNotNull(await copiedClassifier.predict(sample))
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,12 +6,14 @@ import {
|
||||||
BodyTypeRef,
|
BodyTypeRef,
|
||||||
ClientSpamClassifierResultTypeRef,
|
ClientSpamClassifierResultTypeRef,
|
||||||
Mail,
|
Mail,
|
||||||
|
MailAddressTypeRef,
|
||||||
MailDetails,
|
MailDetails,
|
||||||
MailDetailsBlob,
|
MailDetailsBlob,
|
||||||
MailDetailsBlobTypeRef,
|
MailDetailsBlobTypeRef,
|
||||||
MailDetailsTypeRef,
|
MailDetailsTypeRef,
|
||||||
MailFolderTypeRef,
|
MailFolderTypeRef,
|
||||||
MailTypeRef,
|
MailTypeRef,
|
||||||
|
RecipientsTypeRef,
|
||||||
} from "../../../src/common/api/entities/tutanota/TypeRefs.js"
|
} from "../../../src/common/api/entities/tutanota/TypeRefs.js"
|
||||||
import { EntityClient } from "../../../src/common/api/common/EntityClient.js"
|
import { EntityClient } from "../../../src/common/api/common/EntityClient.js"
|
||||||
import { EntityRestClientMock } from "../api/worker/rest/EntityRestClientMock.js"
|
import { EntityRestClientMock } from "../api/worker/rest/EntityRestClientMock.js"
|
||||||
|
|
@ -140,6 +142,14 @@ o.spec("MailModelTest", function () {
|
||||||
mailDetails = createTestEntity(MailDetailsTypeRef, {
|
mailDetails = createTestEntity(MailDetailsTypeRef, {
|
||||||
_id: "mailDetail",
|
_id: "mailDetail",
|
||||||
body: createTestEntity(BodyTypeRef, { text: "some text" }),
|
body: createTestEntity(BodyTypeRef, { text: "some text" }),
|
||||||
|
recipients: createTestEntity(RecipientsTypeRef, {
|
||||||
|
toRecipients: [
|
||||||
|
createTestEntity(MailAddressTypeRef, {
|
||||||
|
name: "Recipient",
|
||||||
|
address: "recipient@tuta.com",
|
||||||
|
}),
|
||||||
|
],
|
||||||
|
}),
|
||||||
})
|
})
|
||||||
mail = createTestEntity(MailTypeRef, {
|
mail = createTestEntity(MailTypeRef, {
|
||||||
_id: ["mailListId", "mailId"],
|
_id: ["mailListId", "mailId"],
|
||||||
|
|
@ -147,7 +157,9 @@ o.spec("MailModelTest", function () {
|
||||||
mailDetails: ["detailsList", mailDetails._id],
|
mailDetails: ["detailsList", mailDetails._id],
|
||||||
subject: "subject",
|
subject: "subject",
|
||||||
sets: [inboxFolder._id],
|
sets: [inboxFolder._id],
|
||||||
|
sender: createTestEntity(MailAddressTypeRef, { name: "Sender", address: "sender@tuta.com" }),
|
||||||
processingState: ProcessingState.INBOX_RULE_NOT_PROCESSED,
|
processingState: ProcessingState.INBOX_RULE_NOT_PROCESSED,
|
||||||
|
authStatus: "0",
|
||||||
})
|
})
|
||||||
const mailDetailsBlob: MailDetailsBlob = createTestEntity(MailDetailsBlobTypeRef, {
|
const mailDetailsBlob: MailDetailsBlob = createTestEntity(MailDetailsBlobTypeRef, {
|
||||||
_id: mail.mailDetails!,
|
_id: mail.mailDetails!,
|
||||||
|
|
@ -296,6 +308,11 @@ o.spec("MailModelTest", function () {
|
||||||
subject: "subject",
|
subject: "subject",
|
||||||
isSpam: false,
|
isSpam: false,
|
||||||
isSpamConfidence: 1,
|
isSpamConfidence: 1,
|
||||||
|
sender: "Sender sender@tuta.com",
|
||||||
|
toRecipients: "Recipient recipient@tuta.com",
|
||||||
|
ccRecipients: "",
|
||||||
|
bccRecipients: "",
|
||||||
|
authStatus: "TAUTHENTICATED",
|
||||||
}
|
}
|
||||||
verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
|
verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
|
||||||
verify(spamClassifier.predict(anything()), { times: 0 })
|
verify(spamClassifier.predict(anything()), { times: 0 })
|
||||||
|
|
@ -321,6 +338,11 @@ o.spec("MailModelTest", function () {
|
||||||
subject: "subject",
|
subject: "subject",
|
||||||
isSpam: false,
|
isSpam: false,
|
||||||
isSpamConfidence: 1,
|
isSpamConfidence: 1,
|
||||||
|
sender: "Sender sender@tuta.com",
|
||||||
|
toRecipients: "Recipient recipient@tuta.com",
|
||||||
|
ccRecipients: "",
|
||||||
|
bccRecipients: "",
|
||||||
|
authStatus: "TAUTHENTICATED",
|
||||||
}
|
}
|
||||||
verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
|
verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
|
||||||
verify(spamClassifier.predict(anything()), { times: 1 })
|
verify(spamClassifier.predict(anything()), { times: 1 })
|
||||||
|
|
@ -352,6 +374,11 @@ o.spec("MailModelTest", function () {
|
||||||
subject: "subject",
|
subject: "subject",
|
||||||
isSpam: false,
|
isSpam: false,
|
||||||
isSpamConfidence: 1,
|
isSpamConfidence: 1,
|
||||||
|
sender: "Sender sender@tuta.com",
|
||||||
|
toRecipients: "Recipient recipient@tuta.com",
|
||||||
|
ccRecipients: "",
|
||||||
|
bccRecipients: "",
|
||||||
|
authStatus: "TAUTHENTICATED",
|
||||||
}
|
}
|
||||||
verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
|
verify(spamClassifier.storeSpamClassification(expectedSpamTrainMailDatum), { times: 1 })
|
||||||
verify(spamClassifier.predict(anything()), { times: 1 })
|
verify(spamClassifier.predict(anything()), { times: 1 })
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue