Implement spam training data sync and add TutanotaModelV98

We sync the spam training data encrypted through our server to make
sure that all clients for a specific user behave the same when
classifying mails. Additionally, this enables the spam classification
in the webApp. We compress the training data vectors
(see ClientSpamTrainingDatum) using SparseVectorCompressor.ts before
uploading them to our server. When a user has ClientSpamClassification
enabled, the spam training data sync happens for every mail
received.

ClientSpamTrainingDatum instances are not stored in the CacheStorage.
No entityEvents are emitted for this type.
However, we retrieve creations and updates for ClientSpamTrainingData
through the modifiedClientSpamTrainingDataIndex.

We calculate a threshold per classifier based on the dataset's
ham-to-spam ratio. We also subsample our training data to cap the
ham-to-spam ratio at a certain limit.

Co-authored-by: jomapp <17314077+jomapp@users.noreply.github.com>
Co-authored-by: das <das@tutao.de>
Co-authored-by: abp <abp@tutao.de>
Co-authored-by: Kinan <104761667+kibibytium@users.noreply.github.com>
Co-authored-by: sug <sug@tutao.de>
Co-authored-by: nif <nif@tutao.de>
Co-authored-by: map <mpfau@users.noreply.github.com>
This commit is contained in:
map 2025-11-03 18:01:36 +01:00 committed by abp
parent f8bbd32695
commit 5293be6a4a
No known key found for this signature in database
GPG key ID: 791D4EC38A7AA7C2
63 changed files with 3877 additions and 1963 deletions

View file

@ -1,7 +1,7 @@
import { BlobElementEntity, Entity, ListElementEntity, ServerModelParsedInstance, SomeEntity, TypeModel } from "../../common/EntityTypes.js"
import { customIdToBase64Url, ensureBase64Ext, firstBiggerThanSecond } from "../../common/utils/EntityUtils.js"
import { customIdToBase64Url, ensureBase64Ext, firstBiggerThanSecond, GENERATED_MIN_ID } from "../../common/utils/EntityUtils.js"
import { CacheStorage, LastUpdateTime } from "./DefaultEntityRestCache.js"
import { assertNotNull, clone, filterNull, getFromMap, getTypeString, Nullable, parseTypeString, remove, TypeRef } from "@tutao/tutanota-utils"
import { assertNotNull, clone, filterNull, getFromMap, getTypeString, newPromise, Nullable, parseTypeString, remove, TypeRef } from "@tutao/tutanota-utils"
import { CustomCacheHandlerMap } from "./cacheHandler/CustomCacheHandler.js"
import { Type as TypeId } from "../../common/EntityConstants.js"
import { ProgrammingError } from "../../common/error/ProgrammingError.js"
@ -10,6 +10,7 @@ import { ModelMapper } from "../crypto/ModelMapper"
import { ServerTypeModelResolver } from "../../common/EntityFunctions"
import { expandId } from "./RestClientIdUtils"
import { hasError } from "../../common/utils/ErrorUtils"
import { SpamClassificationModel } from "../../../../mail-app/workerUtils/spamClassification/SpamClassifier"
/** Cache for a single list. */
type ListCache = {
@ -41,8 +42,9 @@ export class EphemeralCacheStorage implements CacheStorage {
private readonly entities: Map<string, Map<Id, ServerModelParsedInstance>> = new Map()
private readonly lists: Map<string, ListTypeCache> = new Map()
private readonly blobEntities: Map<string, BlobElementTypeCache> = new Map()
private readonly spamClassificationModelCache: Map<Id, SpamClassificationModel> = new Map()
private lastUpdateTime: number | null = null
private lastTrainedTime: number | null = null
private lastTrainingDataId: Id = GENERATED_MIN_ID
private lastTrainedFromScratchTime: number | null = null
private userId: Id | null = null
private lastBatchIdPerGroup = new Map<Id, Id>()
@ -419,12 +421,12 @@ export class EphemeralCacheStorage implements CacheStorage {
this.lastUpdateTime = value
}
async getLastTrainedTime(): Promise<number> {
return this.lastTrainedTime ?? 0
/**
 * Returns the id currently held in the in-memory `lastTrainingDataId` field.
 * The field is initialised to `GENERATED_MIN_ID`, so that value is returned until
 * `setLastTrainingDataIndexId` has been called.
 * NOTE(review): presumably this is the id of the last processed entry of the training data index — confirm against callers.
 */
async getLastTrainingDataIndexId(): Promise<Id> {
return this.lastTrainingDataId
}
async setLastTrainedTime(value: number): Promise<void> {
this.lastTrainedTime = value
/**
 * Overwrites the in-memory `lastTrainingDataId` field with the given id.
 * The value is kept only on this instance; nothing else is written by this method.
 * @param id the id to remember; it is what `getLastTrainingDataIndexId` returns afterwards
 */
async setLastTrainingDataIndexId(id: Id): Promise<void> {
this.lastTrainingDataId = id
}
async getLastTrainedFromScratchTime(): Promise<number> {
@ -435,6 +437,14 @@ export class EphemeralCacheStorage implements CacheStorage {
this.lastTrainedFromScratchTime = ms
}
/**
 * Puts the given spam classification model into the in-memory model cache,
 * keyed by the model's `ownerGroup`. A model already cached under the same
 * owner group is replaced.
 * @param model the model to cache
 */
async setSpamClassificationModel(model: SpamClassificationModel): Promise<void> {
	const { ownerGroup } = model
	this.spamClassificationModelCache.set(ownerGroup, model)
}
/**
 * Looks up the spam classification model cached in memory for an owner group.
 * @param ownerGroup id of the group whose model is requested
 * @returns the cached model, or `null` when none is cached for that group
 */
async getSpamClassificationModel(ownerGroup: Id): Promise<Nullable<SpamClassificationModel>> {
	const cachedModel = this.spamClassificationModelCache.get(ownerGroup)
	return cachedModel ?? null
}
async getWholeList<T extends ListElementEntity>(typeRef: TypeRef<T>, listId: Id): Promise<Array<T>> {
const parsedInstances = await this.getWholeListParsed(typeRef, listId)
return await this.modelMapper.mapToInstances(typeRef, parsedInstances)