mirror of
https://github.com/tutao/tutanota.git
synced 2025-12-08 06:09:50 +00:00
ensure all relevant clientSpamTrainingData is uploaded for mailbox
We want to make sure that all relevant clientSpamTrainingData is uploaded correctly for each mailbox. Previously, if clientSpamTrainingData was not empty for a mailbox, we would not upload any more training data. This led to cases where users had only a fraction of the training data compared to the number of mails available in their mailbox. When training from scratch, we now check whether the number of already existing clientSpamTrainingData entries is smaller than the number of mails relevant for training and, if so, upload the missing entries. Co-authored-by: abp <abp@tutao.de>
This commit is contained in:
parent
aaa7535055
commit
ff856f821e
2 changed files with 129 additions and 20 deletions
|
|
@ -55,16 +55,21 @@ export class SpamClassificationDataDealer {
|
||||||
// clientSpamTrainingData is NOT cached
|
// clientSpamTrainingData is NOT cached
|
||||||
let clientSpamTrainingData = await this.entityClient.loadAll(ClientSpamTrainingDatumTypeRef, mailbox.clientSpamTrainingData)
|
let clientSpamTrainingData = await this.entityClient.loadAll(ClientSpamTrainingDatumTypeRef, mailbox.clientSpamTrainingData)
|
||||||
|
|
||||||
// if the training data is empty for this mailbox, we are aggregating
|
// if the clientSpamTrainingData is empty or does not include all relevant clientSpamTrainingData
|
||||||
// the last INITIAL_SPAM_CLASSIFICATION_INDEX_INTERVAL_DAYS of mails and uploading the training data
|
// for this mailbox, we are aggregating the last INITIAL_SPAM_CLASSIFICATION_INDEX_INTERVAL_DAYS of mails
|
||||||
if (isEmpty(clientSpamTrainingData)) {
|
// and uploading the missing clientSpamTrainingDatum entries
|
||||||
console.log("building and uploading initial training data for mailbox: " + mailbox._id)
|
const allRelevantMailsInTrainingInterval = await this.fetchMailAndMailDetailsForMailbox(mailbox, mailSets)
|
||||||
const mailsWithMailDetails = await this.fetchMailAndMailDetailsForMailbox(mailbox, mailSets)
|
console.log(`mailbox ${mailbox._id} has total ${allRelevantMailsInTrainingInterval.length} relevant mails in training interval for spam classification`)
|
||||||
console.log(`mailbox has ${mailsWithMailDetails.length} mails suitable for encrypted training vector data upload`)
|
if (clientSpamTrainingData.length < allRelevantMailsInTrainingInterval.length) {
|
||||||
console.log(`vectorizing, compressing and encrypting those ${mailsWithMailDetails.length} mails...`)
|
const mailsToUpload = allRelevantMailsInTrainingInterval.filter((mail) => {
|
||||||
await this.uploadTrainingDataForMails(mailsWithMailDetails, mailbox, mailSets)
|
return !clientSpamTrainingData.some((datum) => isSameId(getElementId(mail.mail), getElementId(datum)))
|
||||||
|
})
|
||||||
|
console.log("building and uploading initial / new training data for mailbox: " + mailbox._id)
|
||||||
|
console.log(`mailbox ${mailbox._id} has ${mailsToUpload.length} new mails suitable for encrypted training vector data upload`)
|
||||||
|
console.log(`vectorizing, compressing and encrypting those ${mailsToUpload.length} mails... for mailbox ${mailbox._id}`)
|
||||||
|
await this.uploadTrainingDataForMails(mailsToUpload, mailbox, mailSets)
|
||||||
clientSpamTrainingData = await this.entityClient.loadAll(ClientSpamTrainingDatumTypeRef, mailbox.clientSpamTrainingData)
|
clientSpamTrainingData = await this.entityClient.loadAll(ClientSpamTrainingDatumTypeRef, mailbox.clientSpamTrainingData)
|
||||||
console.log(`clientSpamTrainingData list on the mailbox has ${clientSpamTrainingData.length} members.`)
|
console.log(`clientSpamTrainingData list on the mailbox ${mailbox._id} has ${clientSpamTrainingData.length} members.`)
|
||||||
}
|
}
|
||||||
|
|
||||||
const { subsampledTrainingData, hamCount, spamCount } = this.subsampleHamAndSpamMails(clientSpamTrainingData)
|
const { subsampledTrainingData, hamCount, spamCount } = this.subsampleHamAndSpamMails(clientSpamTrainingData)
|
||||||
|
|
@ -172,8 +177,8 @@ export class SpamClassificationDataDealer {
|
||||||
async fetchMailsByMailbagAfterDate(mailbag: MailBag, mailSets: MailFolder[], startDate: Date): Promise<Array<MailWithMailDetails>> {
|
async fetchMailsByMailbagAfterDate(mailbag: MailBag, mailSets: MailFolder[], startDate: Date): Promise<Array<MailWithMailDetails>> {
|
||||||
const bulkMailLoader = await this.bulkMailLoader()
|
const bulkMailLoader = await this.bulkMailLoader()
|
||||||
const mails = await this.entityClient.loadAll(MailTypeRef, mailbag.mails, timestampToGeneratedId(startDate.getTime()))
|
const mails = await this.entityClient.loadAll(MailTypeRef, mailbag.mails, timestampToGeneratedId(startDate.getTime()))
|
||||||
|
const trashFolder = assertNotNull(mailSets.find((set) => getMailSetKind(set) === MailSetKind.TRASH))
|
||||||
const filteredMails = mails.filter((mail) => {
|
const filteredMails = mails.filter((mail) => {
|
||||||
const trashFolder = assertNotNull(mailSets.find((set) => getMailSetKind(set) === MailSetKind.TRASH))
|
|
||||||
const isMailTrashed = mail.sets.some((setId) => isSameId(setId, trashFolder._id))
|
const isMailTrashed = mail.sets.some((setId) => isSameId(setId, trashFolder._id))
|
||||||
return isNotNull(mail.mailDetails) && !hasError(mail) && mail.receivedDate > startDate && !isMailTrashed
|
return isNotNull(mail.mailDetails) && !hasError(mail) && mail.receivedDate > startDate && !isMailTrashed
|
||||||
})
|
})
|
||||||
|
|
|
||||||
|
|
@ -40,8 +40,13 @@ function createMailByFolderAndReceivedDate(mailId: IdTuple, mailSet: IdTuple, re
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
function createSpamTrainingDatumByConfidenceAndDecision(confidence: string, spamDecision: SpamDecision): ClientSpamTrainingDatum {
|
function createSpamTrainingDatumByConfidenceAndDecision(
|
||||||
|
confidence: string,
|
||||||
|
spamDecision: SpamDecision,
|
||||||
|
id: IdTuple = ["listId", "elementId"],
|
||||||
|
): ClientSpamTrainingDatum {
|
||||||
return createTestEntity(ClientSpamTrainingDatumTypeRef, {
|
return createTestEntity(ClientSpamTrainingDatumTypeRef, {
|
||||||
|
_id: id,
|
||||||
_ownerGroup: "group",
|
_ownerGroup: "group",
|
||||||
confidence,
|
confidence,
|
||||||
spamDecision,
|
spamDecision,
|
||||||
|
|
@ -153,14 +158,24 @@ o.spec("SpamClassificationDataDealer", () => {
|
||||||
o("uploads training data when clientSpamTrainingData is empty", async () => {
|
o("uploads training data when clientSpamTrainingData is empty", async () => {
|
||||||
when(entityClientMock.load(MailboxGroupRootTypeRef, "owner")).thenResolve(mailboxGroupRoot)
|
when(entityClientMock.load(MailboxGroupRootTypeRef, "owner")).thenResolve(mailboxGroupRoot)
|
||||||
when(entityClientMock.load(MailBoxTypeRef, "mailbox")).thenResolve(mailBox)
|
when(entityClientMock.load(MailBoxTypeRef, "mailbox")).thenResolve(mailBox)
|
||||||
const spamTrainingData = Array.from({ length: 10 }, () =>
|
const mails = Array.from({ length: 10 }, (_, index) =>
|
||||||
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.WHITELIST),
|
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "inboxMailId" + index], inboxFolder._id, new Date(), mailDetails._id),
|
||||||
).concat(Array.from({ length: 10 }, () => createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.BLACKLIST)))
|
|
||||||
const mails = Array.from({ length: 10 }, () =>
|
|
||||||
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "inboxMailId"], inboxFolder._id, new Date(), mailDetails._id),
|
|
||||||
).concat(
|
).concat(
|
||||||
Array.from({ length: 10 }, () =>
|
Array.from({ length: 10 }, (_, index) =>
|
||||||
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "spamMailId"], spamFolder._id, new Date(), mailDetails._id),
|
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "spamMailId" + index], spamFolder._id, new Date(), mailDetails._id),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
const spamTrainingData = Array.from({ length: 10 }, (_, index) =>
|
||||||
|
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.WHITELIST, [
|
||||||
|
mailBox.clientSpamTrainingData!,
|
||||||
|
getElementId(mails[index]),
|
||||||
|
]),
|
||||||
|
).concat(
|
||||||
|
Array.from({ length: 10 }, (_, index) =>
|
||||||
|
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.BLACKLIST, [
|
||||||
|
mailBox.clientSpamTrainingData!,
|
||||||
|
getElementId(mails[10 + index]),
|
||||||
|
]),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
const modifiedIndicesSinceStart = spamTrainingData.map((data) =>
|
const modifiedIndicesSinceStart = spamTrainingData.map((data) =>
|
||||||
|
|
@ -203,18 +218,106 @@ o.spec("SpamClassificationDataDealer", () => {
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
o("uploads training data when clientSpamTrainingData does not include all relevant mails", async () => {
|
||||||
|
when(entityClientMock.load(MailboxGroupRootTypeRef, "owner")).thenResolve(mailboxGroupRoot)
|
||||||
|
when(entityClientMock.load(MailBoxTypeRef, "mailbox")).thenResolve(mailBox)
|
||||||
|
|
||||||
|
const relevantMails = Array.from({ length: 40 }, (_, index) =>
|
||||||
|
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "inboxMailId" + index], inboxFolder._id, new Date(), mailDetails._id),
|
||||||
|
).concat(
|
||||||
|
Array.from({ length: 40 }, (_, index) =>
|
||||||
|
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "spamMailId" + index], spamFolder._id, new Date(), mailDetails._id),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
const existingSpamTrainingData = Array.from({ length: 20 }, (_, index) =>
|
||||||
|
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.WHITELIST, [
|
||||||
|
mailBox.clientSpamTrainingData!,
|
||||||
|
getElementId(relevantMails[index]),
|
||||||
|
]),
|
||||||
|
).concat(
|
||||||
|
Array.from({ length: 20 }, (_, index) =>
|
||||||
|
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.BLACKLIST, [
|
||||||
|
mailBox.clientSpamTrainingData!,
|
||||||
|
getElementId(relevantMails[40 + index]),
|
||||||
|
]),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
const updatedSpamTrainingData = Array.from({ length: 40 }, (_, index) =>
|
||||||
|
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.WHITELIST, [
|
||||||
|
mailBox.clientSpamTrainingData!,
|
||||||
|
getElementId(relevantMails[index]),
|
||||||
|
]),
|
||||||
|
).concat(
|
||||||
|
Array.from({ length: 40 }, (_, index) =>
|
||||||
|
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.BLACKLIST, [
|
||||||
|
mailBox.clientSpamTrainingData!,
|
||||||
|
getElementId(relevantMails[40 + index]),
|
||||||
|
]),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
const modifiedIndicesSinceStart = updatedSpamTrainingData.map((data) =>
|
||||||
|
createClientSpamTrainingDatumIndexEntryByClientSpamTrainingDatumElementId(getElementId(data)),
|
||||||
|
)
|
||||||
|
|
||||||
|
when(entityClientMock.loadAll(ClientSpamTrainingDatumTypeRef, mailBox.clientSpamTrainingData!)).thenResolve(
|
||||||
|
existingSpamTrainingData,
|
||||||
|
updatedSpamTrainingData,
|
||||||
|
)
|
||||||
|
when(entityClientMock.loadAll(MailTypeRef, mailBox.currentMailBag!.mails, anything())).thenResolve(relevantMails)
|
||||||
|
when(entityClientMock.loadAll(MailTypeRef, mailBox.archivedMailBags[0].mails, anything())).thenResolve([])
|
||||||
|
when(entityClientMock.loadAll(MailFolderTypeRef, mailBox.folders!.folders)).thenResolve([inboxFolder, spamFolder, trashFolder])
|
||||||
|
when(entityClientMock.loadAll(ClientSpamTrainingDatumIndexEntryTypeRef, mailBox.modifiedClientSpamTrainingDataIndex!)).thenResolve(
|
||||||
|
modifiedIndicesSinceStart,
|
||||||
|
)
|
||||||
|
|
||||||
|
when(bulkMailLoaderMock.loadMailDetails(relevantMails)).thenResolve(
|
||||||
|
relevantMails.map((mail) => {
|
||||||
|
return { mail, mailDetails }
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
|
||||||
|
const trainingDataset = await spamClassificationDataDealer.fetchAllTrainingData("owner")
|
||||||
|
|
||||||
|
// first load: existing (incomplete) training data, second load: training data after the missing entries were uploaded
|
||||||
|
verify(entityClientMock.loadAll(ClientSpamTrainingDatumTypeRef, mailBox.clientSpamTrainingData!), { times: 2 })
|
||||||
|
verify(entityClientMock.loadAll(ClientSpamTrainingDatumIndexEntryTypeRef, mailBox.modifiedClientSpamTrainingDataIndex!), { times: 1 })
|
||||||
|
|
||||||
|
const expectedUploadMailsHam = relevantMails.slice(20, 40)
|
||||||
|
const expectedUploadMailsSpam = relevantMails.slice(60, 80)
|
||||||
|
|
||||||
|
const unencryptedPayload = expectedUploadMailsHam.concat(expectedUploadMailsSpam).map((mail) => {
|
||||||
|
return {
|
||||||
|
mailId: mail._id,
|
||||||
|
isSpam: isSameId(mail.sets[0], spamFolder._id),
|
||||||
|
confidence: DEFAULT_IS_SPAM_CONFIDENCE,
|
||||||
|
vector: new Uint8Array(1),
|
||||||
|
} as UnencryptedPopulateClientSpamTrainingDatum
|
||||||
|
})
|
||||||
|
verify(mailFacadeMock.populateClientSpamTrainingData("owner", unencryptedPayload), { times: 1 })
|
||||||
|
|
||||||
|
o(trainingDataset).deepEquals({
|
||||||
|
trainingData: updatedSpamTrainingData,
|
||||||
|
lastTrainingDataIndexId: getElementId(last(modifiedIndicesSinceStart)!),
|
||||||
|
hamCount: 40,
|
||||||
|
spamCount: 40,
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
o("successfully returns training data with mixed ham/spam data", async () => {
|
o("successfully returns training data with mixed ham/spam data", async () => {
|
||||||
when(entityClientMock.load(MailboxGroupRootTypeRef, "owner")).thenResolve(mailboxGroupRoot)
|
when(entityClientMock.load(MailboxGroupRootTypeRef, "owner")).thenResolve(mailboxGroupRoot)
|
||||||
when(entityClientMock.load(MailBoxTypeRef, "mailbox")).thenResolve(mailBox)
|
when(entityClientMock.load(MailBoxTypeRef, "mailbox")).thenResolve(mailBox)
|
||||||
|
when(entityClientMock.loadAll(MailTypeRef, anything(), anything())).thenResolve([])
|
||||||
|
|
||||||
const spamTrainingData = Array.from({ length: 10 }, () =>
|
const spamTrainingData = Array.from({ length: 10 }, () =>
|
||||||
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.WHITELIST),
|
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.WHITELIST),
|
||||||
).concat(Array.from({ length: 10 }, () => createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.BLACKLIST)))
|
).concat(Array.from({ length: 10 }, () => createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.BLACKLIST)))
|
||||||
|
|
||||||
const modifiedIndicesSinceStart = spamTrainingData.map((data) =>
|
const modifiedIndicesSinceStart = spamTrainingData.map((data) =>
|
||||||
createClientSpamTrainingDatumIndexEntryByClientSpamTrainingDatumElementId(getElementId(data)),
|
createClientSpamTrainingDatumIndexEntryByClientSpamTrainingDatumElementId(getElementId(data)),
|
||||||
)
|
)
|
||||||
when(entityClientMock.loadAll(ClientSpamTrainingDatumTypeRef, mailBox.clientSpamTrainingData!)).thenResolve(spamTrainingData)
|
when(entityClientMock.loadAll(ClientSpamTrainingDatumTypeRef, mailBox.clientSpamTrainingData!)).thenResolve(spamTrainingData)
|
||||||
when(entityClientMock.loadAll(MailTypeRef, mailBox.archivedMailBags[0].mails, anything())).thenResolve([])
|
|
||||||
when(entityClientMock.loadAll(MailFolderTypeRef, mailBox.folders!.folders)).thenResolve([inboxFolder, spamFolder, trashFolder])
|
when(entityClientMock.loadAll(MailFolderTypeRef, mailBox.folders!.folders)).thenResolve([inboxFolder, spamFolder, trashFolder])
|
||||||
when(entityClientMock.loadAll(ClientSpamTrainingDatumIndexEntryTypeRef, mailBox.modifiedClientSpamTrainingDataIndex!)).thenResolve(
|
when(entityClientMock.loadAll(ClientSpamTrainingDatumIndexEntryTypeRef, mailBox.modifiedClientSpamTrainingDataIndex!)).thenResolve(
|
||||||
modifiedIndicesSinceStart,
|
modifiedIndicesSinceStart,
|
||||||
|
|
@ -241,6 +344,7 @@ o.spec("SpamClassificationDataDealer", () => {
|
||||||
const validSpamData = createSpamTrainingDatumByConfidenceAndDecision("4", SpamDecision.BLACKLIST)
|
const validSpamData = createSpamTrainingDatumByConfidenceAndDecision("4", SpamDecision.BLACKLIST)
|
||||||
when(entityClientMock.load(MailboxGroupRootTypeRef, "owner")).thenResolve(mailboxGroupRoot)
|
when(entityClientMock.load(MailboxGroupRootTypeRef, "owner")).thenResolve(mailboxGroupRoot)
|
||||||
when(entityClientMock.load(MailBoxTypeRef, "mailbox")).thenResolve(mailBox)
|
when(entityClientMock.load(MailBoxTypeRef, "mailbox")).thenResolve(mailBox)
|
||||||
|
when(entityClientMock.loadAll(MailTypeRef, anything(), anything())).thenResolve([])
|
||||||
|
|
||||||
const spamTrainingData = [noneDecisionData, zeroConfData, validSpamData, validHamData]
|
const spamTrainingData = [noneDecisionData, zeroConfData, validSpamData, validHamData]
|
||||||
const modifiedIndicesSinceStart = spamTrainingData.map((data) =>
|
const modifiedIndicesSinceStart = spamTrainingData.map((data) =>
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue