Include header fields as tokens in the anti-spam classifier

Add the header fields (sender, toRecipients, ccRecipients, bccRecipients,
authStatus) to the anti-spam vectors. We also improve some of the
preprocessing steps and add offline migrations that delete the old spam
tables.

Co-authored-by: amm@tutao.de
Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com>
This commit is contained in:
das 2025-10-22 16:18:24 +02:00 committed by abp
parent 21ad4ce2c3
commit f8bbd32695
No known key found for this signature in database
GPG key ID: 791D4EC38A7AA7C2
13 changed files with 10918 additions and 10788 deletions

View file

@ -54,12 +54,22 @@ export type SpamTrainMailDatum = {
isSpam: boolean
isSpamConfidence: number
ownerGroup: Id
sender: string
toRecipients: string
ccRecipients: string
bccRecipients: string
authStatus: string
}
/**
 * Mail data without a spam label.
 * NOTE(review): presumably the input for spam prediction, going by the name — confirm against callers.
 */
export type SpamPredMailDatum = {
subject: string
body: string
ownerGroup: Id
// Header fields. getHeaderFeatures writes each of them on its own line and the result is
// appended to the preprocessed mail text. The format of these strings is not visible here.
sender: string
toRecipients: string
ccRecipients: string
bccRecipients: string
authStatus: string
}
// NOTE(review): presumably the minimum predicted spam probability at which a mail is
// classified as spam — the usage is not visible in this excerpt; confirm.
const PREDICTION_THRESHOLD = 0.55
@ -223,9 +233,16 @@ export class SpamClassifier {
preprocessedMail = preprocessedMail.replaceAll(ML_SPACE_BEFORE_NEW_LINE_REGEX, ML_SPACE_BEFORE_NEW_LINE_TOKEN)
}
preprocessedMail += this.getHeaderFeatures(mail)
return preprocessedMail
}
/**
 * Renders the header fields of a mail as text lines.
 *
 * @param mail training or prediction datum whose header fields are read
 * @returns sender, toRecipients, ccRecipients, bccRecipients and authStatus, in that order,
 *          each preceded by a newline character
 */
private getHeaderFeatures(mail: SpamTrainMailDatum | SpamPredMailDatum): string {
	const headerValues = [mail.sender, mail.toRecipients, mail.ccRecipients, mail.bccRecipients, mail.authStatus]
	// A template literal per value (not Array.prototype.join with a separator) keeps the
	// string coercion of every value exactly as it is inside a single template string.
	return headerValues.map((value) => `\n${value}`).join("")
}
public async initialTraining(mails: SpamTrainMailDatum[]): Promise<TrainingPerformance> {
const preprocessingStart = performance.now()
const tokenizedMails = await promiseMap(mails, (mail) => spamClassifierTokenizer(this.preprocessMail(mail)))
@ -497,7 +514,8 @@ export class SpamClassifier {
/**
 * Joins the subject and body of a mail into a single text.
 *
 * A falsy subject or body is treated as an empty string. The two parts are separated by a
 * newline and the joined text is trimmed.
 *
 * @param mail training or prediction datum to read subject and body from
 * @returns the trimmed "subject\nbody" text, or a single space when that text is empty
 */
private concatSubjectAndBody(mail: SpamTrainMailDatum | SpamPredMailDatum): string {
	const subject = mail.subject || ""
	const body = mail.body || ""
	// Newline separator, consistent with the newline-delimited header lines produced by getHeaderFeatures.
	const concatenated = `${subject}\n${body}`.trim()
	// Never return an empty string: fall back to a single space.
	return concatenated.length > 0 ? concatenated : " "
}