mirror of
https://github.com/tutao/tutanota.git
synced 2025-12-08 06:09:50 +00:00
Include header fields as tokens in the anti-spam vectors
Add the header fields (sender, toRecipients, ccRecipients, bccRecipients, authStatus) to the anti-spam vectors. We also improve some of the preprocessing steps and add offline migrations that delete the old spam tables.
Co-authored-by: amm@tutao.de
Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com>
This commit is contained in:
parent
21ad4ce2c3
commit
f8bbd32695
13 changed files with 10918 additions and 10788 deletions
|
|
@ -54,12 +54,22 @@ export type SpamTrainMailDatum = {
|
|||
isSpam: boolean
|
||||
isSpamConfidence: number
|
||||
ownerGroup: Id
|
||||
sender: string
|
||||
toRecipients: string
|
||||
ccRecipients: string
|
||||
bccRecipients: string
|
||||
authStatus: string
|
||||
}
|
||||
|
||||
/**
 * Mail data accepted by the classifier's preprocessing alongside SpamTrainMailDatum.
 *
 * Carries the mail text (subject, body), the owner group and the header fields
 * (sender, toRecipients, ccRecipients, bccRecipients, authStatus) that
 * getHeaderFeatures appends to the preprocessed mail text.
 *
 * NOTE(review): presumably the input shape for prediction (no isSpam label or
 * confidence, unlike SpamTrainMailDatum) — confirm against the callers.
 */
export type SpamPredMailDatum = {
|
||||
subject: string
|
||||
body: string
|
||||
ownerGroup: Id
|
||||
// Header fields are plain strings; how several recipients are encoded inside one
// string is not visible in this excerpt.
sender: string
|
||||
toRecipients: string
|
||||
ccRecipients: string
|
||||
bccRecipients: string
|
||||
authStatus: string
|
||||
}
|
||||
|
||||
// NOTE(review): presumably the cutoff applied to the classifier's output when deciding
// whether a mail counts as spam; the code that uses it is outside this excerpt — confirm.
const PREDICTION_THRESHOLD = 0.55
|
||||
|
|
@ -223,9 +233,16 @@ export class SpamClassifier {
|
|||
preprocessedMail = preprocessedMail.replaceAll(ML_SPACE_BEFORE_NEW_LINE_REGEX, ML_SPACE_BEFORE_NEW_LINE_TOKEN)
|
||||
}
|
||||
|
||||
preprocessedMail += this.getHeaderFeatures(mail)
|
||||
|
||||
return preprocessedMail
|
||||
}
|
||||
|
||||
/**
 * Builds the header-feature suffix that is appended to the preprocessed mail text.
 *
 * @param mail the mail datum whose header fields are read
 * @returns a string made of sender, toRecipients, ccRecipients, bccRecipients and
 *          authStatus, in that order, each preceded by a newline (so the result
 *          always starts with "\n")
 */
private getHeaderFeatures(mail: SpamTrainMailDatum | SpamPredMailDatum): string {
|
||||
const { sender, toRecipients, ccRecipients, bccRecipients, authStatus } = mail
|
||||
// Field values are inserted as-is; no normalisation or escaping happens here.
return `\n${sender}\n${toRecipients}\n${ccRecipients}\n${bccRecipients}\n${authStatus}`
|
||||
}
|
||||
|
||||
public async initialTraining(mails: SpamTrainMailDatum[]): Promise<TrainingPerformance> {
|
||||
const preprocessingStart = performance.now()
|
||||
const tokenizedMails = await promiseMap(mails, (mail) => spamClassifierTokenizer(this.preprocessMail(mail)))
|
||||
|
|
@ -497,7 +514,8 @@ export class SpamClassifier {
|
|||
/**
 * Joins the subject and body of a mail into a single trimmed string.
 *
 * A missing or empty subject/body is treated as the empty string. If the trimmed
 * result is empty, a single space is returned instead, so the result is never "".
 *
 * @param mail the mail datum providing subject and body
 * @returns the trimmed concatenation, or " " when there is no text at all
 */
private concatSubjectAndBody(mail: SpamTrainMailDatum | SpamPredMailDatum) {
|
||||
const subject = mail.subject || ""
|
||||
const body = mail.body || ""
|
||||
// NOTE(review): this excerpt is a rendered diff. The next declaration (space separator)
// appears to be the removed line and the one after it (newline separator) the added
// line; only one of them exists in the real file — confirm against the repository.
const concatenated = `${subject} ${body}`.trim()
|
||||
const concatenated = `${subject}\n${body}`.trim()
|
||||
|
||||
// Fall back to a single space when both subject and body are empty or whitespace-only.
return concatenated.length > 0 ? concatenated : " "
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue