2025-10-14 12:32:17 +02:00
|
|
|
import o from "@tutao/otest"
|
|
|
|
|
import fs from "node:fs"
|
|
|
|
|
import { parseCsv } from "../../../../../../src/common/misc/parsing/CsvParser"
|
|
|
|
|
import {
|
|
|
|
|
DEFAULT_PREPROCESS_CONFIGURATION,
|
|
|
|
|
SpamClassifier,
|
|
|
|
|
SpamTrainMailDatum,
|
|
|
|
|
} from "../../../../../../src/mail-app/workerUtils/spamClassification/SpamClassifier"
|
|
|
|
|
import { OfflineStoragePersistence } from "../../../../../../src/mail-app/workerUtils/index/OfflineStoragePersistence"
|
|
|
|
|
import { matchers, object, when } from "testdouble"
|
|
|
|
|
import { assertNotNull, promiseMap } from "@tutao/tutanota-utils"
|
|
|
|
|
import { SpamClassificationInitializer } from "../../../../../../src/mail-app/workerUtils/spamClassification/SpamClassificationInitializer"
|
|
|
|
|
import { CacheStorage } from "../../../../../../src/common/api/worker/rest/DefaultEntityRestCache"
|
|
|
|
|
import { mockAttribute } from "@tutao/tutanota-test-utils"
|
|
|
|
|
import "@tensorflow/tfjs-backend-cpu"
|
|
|
|
|
import { HashingVectorizer } from "../../../../../../src/mail-app/workerUtils/spamClassification/HashingVectorizer"
|
|
|
|
|
import { LayersModel, tensor1d } from "../../../../../../src/mail-app/workerUtils/spamClassification/tensorflow-custom"
|
2025-10-14 12:11:22 +02:00
|
|
|
import { createTestEntity } from "../../../../TestUtils"
|
|
|
|
|
import { MailTypeRef } from "../../../../../../src/common/api/entities/tutanota/TypeRefs"
|
|
|
|
|
import { Sequential } from "@tensorflow/tfjs-layers"
|
2025-10-14 12:32:17 +02:00
|
|
|
|
2025-10-14 12:11:22 +02:00
|
|
|
const { anything } = matchers
|
2025-10-14 12:32:17 +02:00
|
|
|
export const DATASET_FILE_PATH: string = "./tests/api/worker/utils/spamClassification/spam_classification_test_mails.csv"
|
|
|
|
|
|
|
|
|
|
/**
 * Reads the labelled mail dataset from a CSV file and groups the samples by label.
 *
 * The first and the last parsed row are not turned into samples. Columns used per row:
 * 0 = sender, 1 = to, 2 = cc, 3 = bcc, 4 = auth status, 8 = subject, 10 = body, 11 = label.
 *
 * @param filePath path of the CSV file to load
 * @returns the samples labelled "spam" and the samples labelled "ham"
 * Any label other than "spam" or "ham" is handed to assertNotNull as null together with an
 * "Unknown label detected" message.
 */
export async function readMailDataFromCSV(filePath: string): Promise<{
	spamData: SpamTrainMailDatum[]
	hamData: SpamTrainMailDatum[]
}> {
	const fileContent = await fs.promises.readFile(filePath)
	const parsed = parseCsv(fileContent.toString())

	const spamData: SpamTrainMailDatum[] = []
	const hamData: SpamTrainMailDatum[] = []

	// slice(1, -1): skip the first and the last parsed row
	for (const columns of parsed.rows.slice(1, -1)) {
		const sender = columns[0]
		const toRecipients = columns[1]
		const ccRecipients = columns[2]
		const bccRecipients = columns[3]
		const authStatus = columns[4]
		const subject = columns[8]
		const body = columns[10]
		const label = columns[11]

		let classification: boolean | null = null
		if (label === "spam") {
			classification = true
		} else if (label === "ham") {
			classification = false
		}
		const isSpam = assertNotNull(classification, "Unknown label detected: " + label)

		const datum = {
			mailId: ["mailListId", "mailElementId"],
			subject,
			body,
			isSpam,
			isSpamConfidence: 1,
			ownerGroup: "owner",
			sender,
			toRecipients,
			ccRecipients,
			bccRecipients,
			authStatus,
		} as SpamTrainMailDatum

		if (isSpam) {
			spamData.push(datum)
		} else {
			hamData.push(datum)
		}
	}

	return { spamData, hamData }
}
|
|
|
|
|
|
|
|
|
|
// Initial training (cutoff by day or amount)
|
2025-10-14 12:11:22 +02:00
|
|
|
o.spec("SpamClassifierTest", () => {
|
2025-10-14 12:32:17 +02:00
|
|
|
const mockOfflineStorageCache = object<CacheStorage>()
|
|
|
|
|
const mockOfflineStorage = object<OfflineStoragePersistence>()
|
|
|
|
|
const mockSpamClassificationInitializer = object<SpamClassificationInitializer>()
|
|
|
|
|
let nonEfficientSmallVectorizer: HashingVectorizer
|
|
|
|
|
let spamClassifier: SpamClassifier
|
|
|
|
|
|
|
|
|
|
let spamData: SpamTrainMailDatum[]
|
|
|
|
|
let hamData: SpamTrainMailDatum[]
|
|
|
|
|
let dataSlice: SpamTrainMailDatum[]
|
|
|
|
|
|
|
|
|
|
o.beforeEach(async () => {
	// reload the dataset for every test so that each test starts from the same shuffled data
	const loaded = await readMailDataFromCSV(DATASET_FILE_PATH)
	spamData = loaded.spamData
	hamData = loaded.hamData
	dataSlice = [...spamData, ...hamData]
	seededShuffle(dataSlice, 42)

	// the initializer mock hands out whatever dataSlice references at call time
	mockSpamClassificationInitializer.init = async () => dataSlice

	nonEfficientSmallVectorizer = new HashingVectorizer(512)
	spamClassifier = new SpamClassifier(
		mockOfflineStorage,
		mockOfflineStorageCache,
		mockSpamClassificationInitializer,
		true,
		DEFAULT_PREPROCESS_CONFIGURATION,
		nonEfficientSmallVectorizer,
	)
})
|
|
|
|
|
|
2025-10-14 12:11:22 +02:00
|
|
|
o("processSpam maintains server classification when client classification is not enabled", async function () {
	const layersModel = object<Sequential>()
	const mail = createTestEntity(MailTypeRef, {
		_id: ["mailListId", "mailId"],
		sets: [["folderList", "serverFolder"]],
	})
	const spamTrainMailDatum: SpamTrainMailDatum = {
		mailId: mail._id,
		subject: mail.subject,
		body: "some body",
		isSpam: true,
		isSpamConfidence: 1,
		ownerGroup: "owner",
		sender: "",
		toRecipients: "",
		ccRecipients: "",
		bccRecipients: "",
		authStatus: "",
	}

	// register the model for the owner group with the third argument set to false
	// (presumably the "client classification enabled" flag - confirm against SpamClassifier)
	spamClassifier.addSpamClassifierForOwner(spamTrainMailDatum.ownerGroup, layersModel, false)

	// no prediction is expected in this configuration
	o(await spamClassifier.predict(spamTrainMailDatum)).equals(null)
})
|
|
|
|
|
|
|
|
|
|
o("processSpam uses client classification when enabled", async function () {
	const mail = createTestEntity(MailTypeRef, {
		_id: ["mailListId", "mailId"],
		sets: [["folderList", "serverFolder"]],
	})
	const spamTrainMailDatum: SpamTrainMailDatum = {
		mailId: mail._id,
		subject: mail.subject,
		body: "some body",
		isSpam: false,
		isSpamConfidence: 0,
		ownerGroup: "owner",
		sender: "",
		toRecipients: "",
		ccRecipients: "",
		bccRecipients: "",
		authStatus: "",
	}

	// the stubbed model answers every predict call with a tensor holding the value 1
	const layersModel = object<Sequential>()
	const stubbedPrediction = tensor1d([1])
	when(layersModel.predict(anything())).thenReturn(stubbedPrediction)
	spamClassifier.addSpamClassifierForOwner(spamTrainMailDatum.ownerGroup, layersModel, true)

	// although the datum itself is labelled as ham, the model output decides the result
	o(await spamClassifier.predict(spamTrainMailDatum)).equals(true)
})
|
|
|
|
|
|
2025-10-14 12:32:17 +02:00
|
|
|
o("Initial training only", async () => {
	o.timeout(20_000)

	// first 80 % of the shuffled data is used for training, the rest for evaluation
	const splitIndex = Math.floor(dataSlice.length * 0.8)
	const trainSet = dataSlice.slice(0, splitIndex)
	const testSet = dataSlice.slice(splitIndex)

	await spamClassifier.initialTraining(trainSet)
	// testClassifier only logs the metrics, it does not assert on them
	await testClassifier(spamClassifier, testSet)
})
|
|
|
|
|
|
|
|
|
|
o("Initial training and refitting in multi step", async () => {
	o.timeout(20_000)

	// first 80 % of the shuffled data is used for training, the rest for evaluation
	const splitIndex = Math.floor(dataSlice.length * 0.8)
	const trainSet = dataSlice.slice(0, splitIndex)
	const testSet = dataSlice.slice(splitIndex)

	// the training data is fed in two steps: initial training, then a model update
	const halfIndex = Math.floor(trainSet.length / 2)
	const trainSetFirstHalf = trainSet.slice(0, halfIndex)
	const trainSetSecondHalf = trainSet.slice(halfIndex)

	dataSlice = trainSetFirstHalf
	const initializerData = await mockSpamClassificationInitializer.init("owner")
	o(initializerData).deepEquals(trainSetFirstHalf)

	await spamClassifier.initialTraining(dataSlice)
	console.log(`==> Result when testing with mails in two steps (first step).`)
	await testClassifier(spamClassifier, testSet)

	await spamClassifier.updateModel("owner", trainSetSecondHalf)
	console.log(`==> Result when testing with mails in two steps (second step).`)
	await testClassifier(spamClassifier, testSet)
})
|
|
|
|
|
|
|
|
|
|
o("preprocessMail outputs expected tokens for mail content", async () => {
|
2025-10-14 12:11:22 +02:00
|
|
|
const classifier = new SpamClassifier(object(), object(), object())
|
2025-10-14 12:32:17 +02:00
|
|
|
const mail = {
|
|
|
|
|
subject: `Sample Tokens and values`,
|
2025-10-22 16:18:24 +02:00
|
|
|
sender: "sender",
|
|
|
|
|
toRecipients: "toRecipients",
|
|
|
|
|
ccRecipients: "ccRecipients",
|
|
|
|
|
bccRecipients: "bccRecipients",
|
|
|
|
|
authStatus: "authStatus",
|
2025-10-14 12:32:17 +02:00
|
|
|
// prettier-ignore
|
|
|
|
|
body: `Hello, these are my MAC Address
|
|
|
|
|
FB-94-77-45-96-74
|
|
|
|
|
91-58-81-D5-55-7C
|
|
|
|
|
B4-09-49-2A-DE-D4
|
|
|
|
|
along with my ISBNs
|
|
|
|
|
718385414-0
|
|
|
|
|
733065633-X
|
|
|
|
|
632756390-2
|
|
|
|
|
SSN
|
|
|
|
|
227-78-2283
|
|
|
|
|
134-34-1253
|
|
|
|
|
591-61-6459
|
|
|
|
|
SHAs
|
|
|
|
|
585eab9b3a5e4430e08f5096d636d0d475a8c69dae21a61c6f1b26c4bd8dd8c1
|
|
|
|
|
7233d153f2e0725d3d212d1f27f30258fafd72b286d07b3b1d94e7e3c35dce67
|
|
|
|
|
769f65bf44557df44fc5f99c014cbe98894107c9d7be0801f37c55b3776c3990
|
|
|
|
|
Phone Numbers
|
|
|
|
|
(341) 2027690
|
|
|
|
|
+385 958 638 7625
|
|
|
|
|
430-284-9438
|
|
|
|
|
VIN (Vehicle identification number)
|
|
|
|
|
3FADP4AJ3BM438397
|
|
|
|
|
WAULT64B82N564937
|
|
|
|
|
GUIDs
|
|
|
|
|
781a9631-0716-4f9c-bb36-25c3364b754b
|
|
|
|
|
325783d4-a64e-453b-85e6-ed4b2cd4c9bf
|
|
|
|
|
Hex Colors
|
|
|
|
|
#2016c1
|
|
|
|
|
#c090a4
|
|
|
|
|
#c855f5
|
|
|
|
|
#000000
|
|
|
|
|
IPV4
|
|
|
|
|
91.17.182.120
|
|
|
|
|
47.232.175.0
|
|
|
|
|
171.90.3.93
|
|
|
|
|
On Date:
|
|
|
|
|
01-12-2023
|
|
|
|
|
1-12-2023
|
|
|
|
|
Not Date
|
|
|
|
|
2023/12-1
|
|
|
|
|
URL
|
|
|
|
|
https://tuta.com
|
|
|
|
|
https://subdomain.microsoft.com/outlook/test
|
|
|
|
|
NOT URL
|
|
|
|
|
https://tuta/com
|
|
|
|
|
MAIL
|
|
|
|
|
test@example.com
|
|
|
|
|
plus+addressing@example.com
|
|
|
|
|
Credit Card
|
|
|
|
|
5002355116026522
|
|
|
|
|
4041 3751 9030 3866
|
|
|
|
|
Not Credit Card
|
|
|
|
|
1234 1234
|
|
|
|
|
Bit Coin Address
|
|
|
|
|
159S1vV25PAxMiCVaErjPznbWB8YBvANAi
|
|
|
|
|
1NJmLtKTyHyqdKo6epyF9ecMyuH1xFWjEt
|
|
|
|
|
Not BTC
|
|
|
|
|
5213nYwhhGw2qpNijzfnKcbCG4z3hnrVA
|
|
|
|
|
1OUm2eZK2ETeAo8v95WhZioQDy32YSerkD
|
|
|
|
|
Special Characters
|
|
|
|
|
!
|
|
|
|
|
@
|
2025-10-22 16:18:24 +02:00
|
|
|
Not Special Character
|
|
|
|
|
§
|
2025-10-14 12:32:17 +02:00
|
|
|
Number Sequences:
|
|
|
|
|
26098375
|
|
|
|
|
IBAN: DE91 1002 0370 0320 2239 82
|
|
|
|
|
Not Number Sequences
|
|
|
|
|
SHLT116
|
|
|
|
|
gb_67ca4b
|
|
|
|
|
Other values found in mails
|
|
|
|
|
5.090 € 37 m 1 Zi 100%
|
|
|
|
|
Fax (089) 13 33 87 88
|
|
|
|
|
August 12, 2025
|
|
|
|
|
5:20 PM - 5:25 PM
|
|
|
|
|
<this gets removed by HTML as it should use < to represent the character>
|
|
|
|
|
and all text on other lines it seems.
|
|
|
|
|
<div>
|
|
|
|
|
<a rel="noopener noreferrer" target="_blank" href="https://www.somewebsite.de/?key=c2f395513421312029680" style="background-color:#055063;border-radius:3px;color:#ffffff;display:inline-block;font-size: 14px; font-family: sans-serif;font-weight:bold;line-height:36px;height:36px;text-align:center;text-decoration:none;width:157px;-webkit-text-size-adjust:none; margin-bottom:20px">Button Text</a>
|
|
|
|
|
</div>
|
|
|
|
|
<table cellpadding="0" cellspacing="0" border="0" role="presentation" width="100%"><tbody><tr><td align="center"><a href="https://mail.abc-web.de/optiext/optiextension.dll?ID=someid" rel="noopener noreferrer" target="_blank" style="text-decoration:none"><img id="OWATemporaryImageDivContainer1" src="https://mail.some-domain.de/images/SMC/grafik/image.png" alt="" border="0" class="" width="100%" style="max-width:100%;display:block;width:100%"></a></td></tr></tbody></table>
|
|
|
|
|
this text is shown
|
|
|
|
|
`,
|
|
|
|
|
} as SpamTrainMailDatum
|
|
|
|
|
const preprocessedMail = classifier.preprocessMail(mail)
|
|
|
|
|
// prettier-ignore
|
2025-10-22 16:18:24 +02:00
|
|
|
const expectedOutput = `Sample Tokens and values
|
|
|
|
|
Hello TSPECIALCHAR these are my MAC Address
|
|
|
|
|
\t\t\t\tFB TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
|
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR D5 TSPECIALCHAR TNUMBER TSPECIALCHAR 7C
|
|
|
|
|
\t\t\t\tB4 TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR 2A TSPECIALCHAR DE TSPECIALCHAR D4
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\talong with my ISBNs
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER
|
|
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR X
|
|
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tSSN
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
|
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
|
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tSHAs
|
|
|
|
|
\t\t\t\t585eab9b3a5e4430e08f5096d636d0d475a8c69dae21a61c6f1b26c4bd8dd8c1
|
|
|
|
|
\t\t\t\t7233d153f2e0725d3d212d1f27f30258fafd72b286d07b3b1d94e7e3c35dce67
|
|
|
|
|
\t\t\t\t769f65bf44557df44fc5f99c014cbe98894107c9d7be0801f37c55b3776c3990
|
|
|
|
|
\t\t\t\tPhone Numbers
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
|
|
|
|
\t\t\t\t TSPECIALCHAR TNUMBER TNUMBER TNUMBER TNUMBER
|
|
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
|
|
|
|
\t\t\t\tVIN TSPECIALCHAR Vehicle identification number TSPECIALCHAR
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\t3FADP4AJ3BM438397
|
|
|
|
|
\t\t\t\tWAULT64B82N564937
|
|
|
|
|
\t\t\t\tGUIDs
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t781a9631 TSPECIALCHAR TNUMBER TSPECIALCHAR 4f9c TSPECIALCHAR bb36 TSPECIALCHAR 25c3364b754b
|
|
|
|
|
\t\t\t\t325783d4 TSPECIALCHAR a64e TSPECIALCHAR 453b TSPECIALCHAR 85e6 TSPECIALCHAR ed4b2cd4c9bf
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tHex Colors
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TSPECIALCHAR 2016c1
|
|
|
|
|
\t\t\t\t TSPECIALCHAR c090a4
|
|
|
|
|
\t\t\t\t TSPECIALCHAR c855f5
|
|
|
|
|
\t\t\t\t TSPECIALCHAR TNUMBER
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tIPV4
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
|
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
|
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
|
|
|
|
\t\t\t\tOn Date TSPECIALCHAR
|
|
|
|
|
\t\t\t\t TDATE
|
|
|
|
|
\t\t\t\t TDATE
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tNot Date
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tURL
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TURLtuta TSPECIALCHAR com
|
|
|
|
|
\t\t\t\t TURLsubdomain TSPECIALCHAR microsoft TSPECIALCHAR com
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tNOT URL
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TURLtuta
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tMAIL
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TEMAIL
|
|
|
|
|
\t\t\t\t TEMAIL
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tCredit Card
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TCREDITCARD
|
|
|
|
|
\t\t\t\t TCREDITCARD
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tNot Credit Card
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TNUMBER TNUMBER
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tBit Coin Address
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TBITCOIN
|
|
|
|
|
\t\t\t\t TBITCOIN
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tNot BTC
|
|
|
|
|
\t\t\t\t5213nYwhhGw2qpNijzfnKcbCG4z3hnrVA
|
|
|
|
|
\t\t\t\t1OUm2eZK2ETeAo8v95WhZioQDy32YSerkD
|
|
|
|
|
\t\t\t\tSpecial Characters
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TSPECIALCHAR
|
|
|
|
|
\t\t\t\t TSPECIALCHAR
|
|
|
|
|
\t\t\t\tNot Special Character
|
|
|
|
|
\t\t\t\t§
|
|
|
|
|
\t\t\t\tNumber Sequences TSPECIALCHAR
|
|
|
|
|
\t\t\t\t TNUMBER
|
|
|
|
|
\t\t\t\tIBAN TSPECIALCHAR DE91 TCREDITCARD TNUMBER
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tNot Number Sequences
|
|
|
|
|
\t\t\t\tSHLT116
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\tgb TSPECIALCHAR 67ca4b
|
2025-10-14 12:32:17 +02:00
|
|
|
\t\t\t\tOther values found in mails
|
2025-10-22 16:18:24 +02:00
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER € TNUMBER m TNUMBER Zi TNUMBER TSPECIALCHAR
|
|
|
|
|
\t\t\t\tFax TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER TNUMBER TNUMBER TNUMBER
|
|
|
|
|
\t\t\t\tAugust TNUMBER TSPECIALCHAR TNUMBER
|
|
|
|
|
\t\t\t\t TNUMBER TSPECIALCHAR TNUMBER PM TSPECIALCHAR TNUMBER TSPECIALCHAR TNUMBER PM
|
|
|
|
|
\t\t\t\tand all text on other lines it seems TSPECIALCHAR
|
2025-10-14 12:32:17 +02:00
|
|
|
Button Text
|
2025-10-22 16:18:24 +02:00
|
|
|
this text is shown
|
|
|
|
|
sender
|
|
|
|
|
toRecipients
|
|
|
|
|
ccRecipients
|
|
|
|
|
bccRecipients
|
|
|
|
|
authStatus`
|
2025-10-14 12:32:17 +02:00
|
|
|
o.check(preprocessedMail).equals(expectedOutput)
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
o("predict uses different models for different owner groups", async () => {
	const firstGroupModel = object<LayersModel>()
	const secondGroupModel = object<LayersModel>()

	// every owner group gets its own stubbed model, any other group gets none
	mockAttribute(spamClassifier, spamClassifier.loadModel, (ownerGroup) => {
		switch (ownerGroup) {
			case "firstGroup":
				return Promise.resolve(firstGroupModel)
			case "secondGroup":
				return Promise.resolve(secondGroupModel)
			default:
				return null
		}
	})
	mockAttribute(spamClassifier, spamClassifier.updateAndSaveModel, () => Promise.resolve())

	// first group model always answers 1.0, second group model always answers 0.0
	const firstGroupReturnTensor = tensor1d([1.0], undefined)
	const secondGroupReturnTensor = tensor1d([0.0], undefined)
	when(firstGroupModel.predict(matchers.anything())).thenReturn(firstGroupReturnTensor)
	when(secondGroupModel.predict(matchers.anything())).thenReturn(secondGroupReturnTensor)

	await spamClassifier.initialize("firstGroup")
	await spamClassifier.initialize("secondGroup")

	const commonSpamFields = {
		subject: "",
		body: "",
		sender: "string",
		toRecipients: "string",
		ccRecipients: "string",
		bccRecipients: "string",
		authStatus: "",
	}
	const isSpamFirstMail = await spamClassifier.predict({
		ownerGroup: "firstGroup",
		...commonSpamFields,
	})
	const isSpamSecondMail = await spamClassifier.predict({
		ownerGroup: "secondGroup",
		...commonSpamFields,
	})

	o(isSpamFirstMail).equals(true)
	o(isSpamSecondMail).equals(false)

	// manually dispose @tensorflow tensors to save memory
	firstGroupReturnTensor.dispose()
	secondGroupReturnTensor.dispose()
})
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
// These are analyses rather than tests.
// They run in loops, so they take longer to finish and do not need to be part of the CI test suite.
//
// To enable running them, change the following constant to true.
|
|
|
|
|
const DO_RUN_PERFORMANCE_ANALYSIS = false
|
|
|
|
|
if (DO_RUN_PERFORMANCE_ANALYSIS) {
|
|
|
|
|
o.spec("SpamClassifier - Performance Analysis", () => {
|
|
|
|
|
const mockOfflineStorageCache = object<CacheStorage>()
|
|
|
|
|
const mockOfflineStorage = object<OfflineStoragePersistence>()
|
2025-10-14 12:11:22 +02:00
|
|
|
let spamClassifier = object<SpamClassifier>()
|
2025-10-14 12:32:17 +02:00
|
|
|
let dataSlice: SpamTrainMailDatum[]
|
|
|
|
|
o.beforeEach(() => {
	// fresh initializer mock and classifier for every analysis run;
	// the mock hands out whatever dataSlice references at call time
	const initializerMock = object<SpamClassificationInitializer>()
	initializerMock.init = async () => dataSlice
	spamClassifier = new SpamClassifier(mockOfflineStorage, mockOfflineStorageCache, initializerMock)
})
|
|
|
|
|
|
|
|
|
|
o("time to refit", async () => {
	o.timeout(20_000_000)

	const dataset = await readMailDataFromCSV(DATASET_FILE_PATH)
	const hamSlice = dataset.hamData.slice(0, 1000)
	const spamSlice = dataset.spamData.slice(0, 400)
	dataSlice = hamSlice.concat(spamSlice)
	seededShuffle(dataSlice, 42)

	// measure the initial training on the whole slice
	const start = performance.now()
	await spamClassifier.initialTraining(dataSlice)
	const initialTrainingDuration = performance.now() - start
	console.log(`initial training time ${initialTrainingDuration}ms`)

	// measure 20 consecutive single-mail updates; the first ham mail is relabelled as spam each time
	for (let round = 1; round <= 20; round++) {
		const nowSpam = [hamSlice[0]]
		for (const formerHam of nowSpam) {
			formerHam.isSpam = true
		}

		const retrainingStart = performance.now()
		await spamClassifier.updateModel("owner", nowSpam)
		const retrainingDuration = performance.now() - retrainingStart
		console.log(`retraining time ${retrainingDuration}ms`)
	}
})
|
|
|
|
|
|
|
|
|
|
o("refit after moving a false negative classification multiple times", async () => {
	o.timeout(20_000_000)
	const { spamData, hamData } = await readMailDataFromCSV(DATASET_FILE_PATH)
	const hamSlice = hamData.slice(0, 100)
	const spamSlice = spamData.slice(0, 10)
	dataSlice = hamSlice.concat(spamSlice)
	// seededShuffle(dataSlice, 42)

	await spamClassifier.initialTraining(dataSlice)

	// Collect up to 10 spam mails (not part of the training slice) whose prediction differs from their label.
	// Array.prototype.filter cannot be used with an async predicate: the returned Promise is always truthy,
	// so every element would pass. The predictions are therefore awaited one by one.
	const falseNegatives: SpamTrainMailDatum[] = []
	for (const mailDatum of spamData.slice(10)) {
		if (falseNegatives.length >= 10) {
			break
		}
		if (mailDatum.isSpam !== (await spamClassifier.predict(mailDatum))) {
			falseNegatives.push(mailDatum)
		}
	}

	// number of model updates needed per sample until it is predicted as spam
	const retrainingNeeded = new Array<number>(falseNegatives.length).fill(0)
	for (let i = 0; i < falseNegatives.length; i++) {
		const sample = falseNegatives[i]
		// work on a copy so that the samples do not influence each other
		const copiedClassifier = await spamClassifier.cloneClassifier()

		let retrainCount = 0
		let predictedSpam = false
		while (!predictedSpam && retrainCount++ <= 3) {
			await copiedClassifier.updateModel("owner", [{ ...sample, isSpam: true, isSpamConfidence: 1 }])
			predictedSpam = assertNotNull(await copiedClassifier.predict(sample))
		}
		retrainingNeeded[i] = retrainCount
	}

	console.log(retrainingNeeded)
	const maxRetrain = Math.max(...retrainingNeeded)
	// NOTE(review): this now only holds if at least 10 mails are really misclassified
	o.check(retrainingNeeded.length >= 10).equals(true)
	o.check(maxRetrain < 3).equals(true)
})
|
|
|
|
|
|
|
|
|
|
o("refit after moving a false positive classification multiple times", async () => {
	o.timeout(20_000_000)
	const { spamData, hamData } = await readMailDataFromCSV(DATASET_FILE_PATH)
	const hamSlice = hamData.slice(0, 10)
	const spamSlice = spamData.slice(0, 100)
	dataSlice = hamSlice.concat(spamSlice)
	// seededShuffle(dataSlice, 42)

	await spamClassifier.initialTraining(dataSlice)

	// Collect up to 10 ham mails (not part of the training slice) whose prediction differs from their label.
	// Array.prototype.filter cannot be used with an async predicate: the returned Promise is always truthy,
	// so every element would pass. The predictions are therefore awaited one by one.
	const falsePositive: SpamTrainMailDatum[] = []
	for (const mailDatum of hamData.slice(10)) {
		if (falsePositive.length >= 10) {
			break
		}
		if (mailDatum.isSpam !== (await spamClassifier.predict(mailDatum))) {
			falsePositive.push(mailDatum)
		}
	}

	// number of update rounds needed per sample
	const retrainingNeeded = new Array<number>(falsePositive.length).fill(0)
	for (let i = 0; i < falsePositive.length; i++) {
		const sample = falsePositive[i]
		// work on a copy so that the samples do not influence each other
		const copiedClassifier = await spamClassifier.cloneClassifier()

		let retrainCount = 0
		let predictedSpam = false
		// NOTE(review): the loop stops once the sample is predicted as spam; for a false positive one would
		// expect it to stop once the sample is predicted as ham - confirm the intended condition
		while (!predictedSpam && retrainCount++ <= 10) {
			await copiedClassifier.updateModel("owner", [{ ...sample, isSpam: true }])
			await copiedClassifier.updateModel("owner", [{ ...sample, isSpam: false }])
			predictedSpam = assertNotNull(await copiedClassifier.predict(sample))
		}
		retrainingNeeded[i] = retrainCount
	}

	console.log(retrainingNeeded)
	const maxRetrain = Math.max(...retrainingNeeded)
	// NOTE(review): this now only holds if at least 10 mails are really misclassified
	o.check(retrainingNeeded.length >= 10).equals(true)
	o.check(maxRetrain < 3).equals(true)
})
|
|
|
|
|
|
|
|
|
|
o("retrain after moving a false negative classification multiple times", async () => {
	o.timeout(20_000_000)
	const { spamData, hamData } = await readMailDataFromCSV(DATASET_FILE_PATH)
	const hamSlice = hamData.slice(0, 100)
	const spamSlice = spamData.slice(0, 10)
	dataSlice = hamSlice.concat(spamSlice)
	seededShuffle(dataSlice, 42)

	await spamClassifier.initialTraining(dataSlice)

	// Collect up to 10 spam mails (not part of the training slice) whose prediction differs from their label.
	// Array.prototype.filter cannot be used with an async predicate: the returned Promise is always truthy,
	// so every element would pass. The predictions are therefore awaited one by one.
	const falseNegatives: SpamTrainMailDatum[] = []
	for (const mailDatum of spamData.slice(10)) {
		if (falseNegatives.length >= 10) {
			break
		}
		if (mailDatum.isSpam !== (await spamClassifier.predict(mailDatum))) {
			falseNegatives.push(mailDatum)
		}
	}

	// number of full retrainings needed per sample until it is predicted as spam
	const retrainingNeeded = new Array<number>(falseNegatives.length).fill(0)
	for (let i = 0; i < falseNegatives.length; i++) {
		const sample = falseNegatives[i]
		// work on a copy so that the samples do not influence each other
		const copiedClassifier = await spamClassifier.cloneClassifier()

		let retrainCount = 0
		let predictedSpam = false
		while (!predictedSpam && retrainCount++ <= 10) {
			// unlike the refit analysis, the model is trained from scratch with the sample appended
			await copiedClassifier.initialTraining([...dataSlice, sample])
			predictedSpam = assertNotNull(await copiedClassifier.predict(sample))
		}
		retrainingNeeded[i] = retrainCount
	}

	console.log(retrainingNeeded)
	const maxRetrain = Math.max(...retrainingNeeded)
	// NOTE(review): this now only holds if at least 10 mails are really misclassified
	o.check(retrainingNeeded.length >= 10).equals(true)
	o.check(maxRetrain < 3).equals(true)
})
|
|
|
|
|
|
|
|
|
|
o("Time spent in vectorization during initial training", async () => {
	o.timeout(2_000_000)

	const ITERATION_COUNT: number = 1
	const { spamData, hamData } = await readMailDataFromCSV(DATASET_FILE_PATH)
	dataSlice = spamData.concat(hamData)

	const trainingTimes: number[] = []
	const vectorizationTimes: number[] = []
	const trainingWithoutVectorization: number[] = []

	// run the initial training ITERATION_COUNT times and record the durations it reports
	await promiseMap(
		new Array<number>(ITERATION_COUNT).fill(0),
		async () => {
			const { vectorizationTime, trainingTime } = await spamClassifier.initialTraining(dataSlice)
			trainingTimes.push(trainingTime)
			vectorizationTimes.push(vectorizationTime)
			trainingWithoutVectorization.push(trainingTime - vectorizationTime)
		},
		{ concurrency: ITERATION_COUNT },
	)

	// Sort numerically: without a comparator, sort() compares the string representations
	// (e.g. 1000 would be ordered before 200) and min/max below would be wrong.
	const ascending = (a: number, b: number): number => a - b
	trainingTimes.sort(ascending)
	vectorizationTimes.sort(ascending)
	trainingWithoutVectorization.sort(ascending)

	const avgTrainingTime = trainingTimes.reduce((a, b) => a + b, 0) / trainingTimes.length
	const avgVectorizationTime = vectorizationTimes.reduce((a, b) => a + b, 0) / vectorizationTimes.length
	const avgTrainingWithoutVectorization = trainingWithoutVectorization.reduce((a, b) => a + b, 0) / trainingWithoutVectorization.length

	// the arrays are sorted ascending, so the first entry is the minimum and the last one the maximum
	console.log("For vectorization:")
	console.log({ min: vectorizationTimes.at(0), max: vectorizationTimes.at(-1), avg: avgVectorizationTime })
	console.log("For whole training:")
	console.log({ min: trainingTimes.at(0), max: trainingTimes.at(-1), avg: avgTrainingTime })
	console.log("For training without vectorization:")
	console.log({
		min: trainingWithoutVectorization.at(0),
		max: trainingWithoutVectorization.at(-1),
		avg: avgTrainingWithoutVectorization,
	})
})
|
|
|
|
|
})
|
|
|
|
|
}
|
2025-10-14 12:11:22 +02:00
|
|
|
|
|
|
|
|
/**
 * Runs the classifier over all given mails and logs accuracy, precision, recall, F1 score and
 * the confusion matrix to the console. Nothing is asserted and nothing is returned.
 *
 * A falsy prediction (false or null) is counted as "predicted ham".
 *
 * @param classifier classifier whose predict method is evaluated
 * @param mails labelled mails; isSpam is taken as the expected result
 */
async function testClassifier(classifier: SpamClassifier, mails: SpamTrainMailDatum[]): Promise<void> {
	// predictions are requested one after another, in the order of the mails
	const predictedAsSpam: boolean[] = []
	for (const mail of mails) {
		const prediction = await classifier.predict(mail)
		predictedAsSpam.push(Boolean(prediction))
	}
	const expectedLabels = mails.map((mail) => mail.isSpam)

	let tp = 0
	let tn = 0
	let fp = 0
	let fn = 0
	predictedAsSpam.forEach((predictedSpam, index) => {
		const isActuallyASpam = expectedLabels[index]
		if (predictedSpam) {
			if (isActuallyASpam) {
				tp++
			} else {
				fp++
			}
		} else if (isActuallyASpam) {
			fn++
		} else {
			tn++
		}
	})

	const total = tp + tn + fp + fn
	const accuracy = (tp + tn) / total
	// the small epsilon keeps the divisions defined when a denominator would be zero
	const precision = tp / (tp + fp + 1e-7)
	const recall = tp / (tp + fn + 1e-7)
	const f1 = 2 * ((precision * recall) / (precision + recall + 1e-7))

	const asPercent = (ratio: number): string => (ratio * 100).toFixed(2)

	console.log("\n--- Evaluation Metrics ---")
	console.log(`Accuracy: \t${asPercent(accuracy)}%`)
	console.log(`Precision:\t${asPercent(precision)}%`)
	console.log(`Recall: \t${asPercent(recall)}%`)
	console.log(`F1 Score: \t${asPercent(f1)}%`)
	console.log("\nConfusion Matrix:")
	console.log({
		Predicted_Spam: { True_Positive: tp, False_Positive: fp },
		Predicted_Ham: { False_Negative: fn, True_Negative: tn },
	})
}
|
|
|
|
|
|
2025-10-14 12:32:17 +02:00
|
|
|
// For testing, we need deterministic shuffling which is not provided by tf.util.shuffle(dataSlice)
|
|
|
|
|
// Seeded Fisher-Yates shuffle
|
|
|
|
|
/**
 * Shuffles the array in place, walking from the last index down to index 1 and swapping each
 * position with a randomly picked position at or below it.
 *
 * @param array array to shuffle; it is modified
 * @param seed seed passed to seededRandom, which supplies the random numbers
 */
function seededShuffle<T>(array: T[], seed: number): void {
	const nextRandom = seededRandom(seed)
	let current = array.length - 1
	while (current > 0) {
		// pick a partner index in [0, current]
		const partner = Math.floor(nextRandom() * (current + 1))
		const held = array[current]
		array[current] = array[partner]
		array[partner] = held
		current--
	}
}
|
|
|
|
|
|
|
|
|
|
/**
 * Creates a deterministic pseudo random number generator (linear congruential generator):
 * state = (a * state + c) mod 2^31 with a = 1103515245 and c = 12345.
 *
 * @param seed initial state; it is treated as a 32 bit integer
 * @returns a function that returns the next number in [0, 1) on every call
 */
function seededRandom(seed: number): () => number {
	const m = 0x80000000 // 2^31
	const a = 1103515245
	const c = 12345

	let state = seed

	return function (): number {
		// a * state can exceed 2^53, so a plain floating point multiplication silently drops the
		// low bits of the product. Math.imul keeps the exact low 32 bits, which is all that is needed
		// for a result modulo 2^31; the mask (m - 1) performs that modulo and keeps it non-negative.
		state = (Math.imul(a, state) + c) & (m - 1)
		return state / m
	}
}
|