Mirror of https://github.com/tutao/tutanota.git, synced 2025-12-08 06:09:50 +00:00

remove DynamicTfVectorizer

Co-authored-by: map <mpfau@users.noreply.github.com>

parent 6d4a656a69
commit 5124985d4f

4 changed files with 1 addition and 189 deletions
@@ -1,112 +0,0 @@
-const DEFAULT_TOKEN_VOCABULARY_LIMIT = 5000
-const DEFAULT_USE_TOKEN_STEMMING = true
-
-export type Stats = {
-	/// This is the lowest frequency of word found in current vocabulary,
-	/// which means all included vocabulary item have frequency higher than or equal to this frequency.
-	lowestIncludedFrequency: number
-	/// Out of total corpus of token vocabulary, these many items were excluded from vocabulary,
-	excludedTokenVocabularyCount: number
-}
-
-export class DynamicTfVectorizer {
-	readonly dimension: number
-
-	private stats: Stats | null = null
-
-	public constructor(
-		tokenVocabulary: Set<string>,
-		readonly useTokenStemming: boolean = DEFAULT_USE_TOKEN_STEMMING,
-		readonly tokenVocabularyLimit: number = DEFAULT_TOKEN_VOCABULARY_LIMIT,
-	) {
-		this.tokenVocabulary = tokenVocabulary
-		// we account for 50% more vocabulary than initially occupied
-		this.dimension = tokenVocabularyLimit + tokenVocabularyLimit * 0.5
-	}
-
-	private tokenVocabulary: Set<string>
-
-	public buildInitialTokenVocabulary(initialTokenizedMails: ReadonlyArray<ReadonlyArray<string>>) {
-		console.log(initialTokenizedMails)
-
-		const allTokenFrequencies = initialTokenizedMails.reduce((_, tokenizedMail) => this.getTokenFrequency(tokenizedMail), new Map<string, number>())
-
-		console.log(allTokenFrequencies)
-
-		const mostCommonTokens = Array.from(allTokenFrequencies.entries())
-			.sort((a, b) => b[1] - a[1])
-			.slice(0, this.tokenVocabularyLimit)
-
-		const lowestIncludedFrequency = mostCommonTokens[mostCommonTokens.length - 1][1]
-		const excludedTokenVocabularyCount = allTokenFrequencies.size - mostCommonTokens.length
-
-		console.log(mostCommonTokens)
-
-		this.tokenVocabulary = new Set(mostCommonTokens.map(([token, _frequency]) => token))
-		console.log(this.tokenVocabulary)
-		this.stats = { lowestIncludedFrequency, excludedTokenVocabularyCount }
-	}
-
-	private getTokenFrequency(tokenCollection: ReadonlyArray<string>, expandTokenVocabulary = false) {
-		const resultTokenFrequencyMap = new Map<string, number>()
-		for (let token of tokenCollection) {
-			if (this.useTokenStemming) {
-				//token = stemmer(token)
-			}
-			if (expandTokenVocabulary && !this.tokenVocabulary.has(token)) {
-				this.expandTokenVocabulary(token)
-			}
-
-			resultTokenFrequencyMap.set(token, (resultTokenFrequencyMap.get(token) || 0) + 1)
-		}
-		return resultTokenFrequencyMap
-	}
-
-	/**
-	 * Expand (add to) the token vocabulary with the new token.
-	 */
-	private expandTokenVocabulary(token: string) {
-		this.tokenVocabulary.add(token)
-	}
-
-	public transform(tokenizedMails: Array<ReadonlyArray<string>>): number[][] {
-		return this._transform(tokenizedMails, false)
-	}
-
-	/**
-	 * transform method to be used when refitting
-	 * @returns: null in case a full retraining of the model is required
-	 */
-	public refitTransform(tokenizedMails: Array<ReadonlyArray<string>>): number[][] | null {
-		const transformResult = this._transform(tokenizedMails, true)
-
-		const availableSpace = this.dimension - this.tokenVocabulary.size
-		if (availableSpace <= 0) {
-			return null
-		} else {
-			return transformResult
-		}
-	}
-
-	private _transform(tokenizedMails: Array<ReadonlyArray<string>>, expandTokenVocabulary = false): number[][] {
-		return tokenizedMails.map((tokenizedMail) => this.vectorize(tokenizedMail, expandTokenVocabulary))
-	}
-
-	// visibleForTesting
-	public vectorize(tokenizedMail: ReadonlyArray<string>, expandTokenVocabulary = false): number[] {
-		const tokenFrequencyMap = this.getTokenFrequency(tokenizedMail, expandTokenVocabulary)
-
-		let index = 0
-		let vector = new Array<number>(this.dimension).fill(0)
-		for (const [token, _] of this.tokenVocabulary.entries()) {
-			vector[index] = tokenFrequencyMap.get(token) ?? 0
-			index += 1
-		}
-
-		return vector
-	}
-
-	public getStats(): Stats | null {
-		return Object.seal(this.stats)
-	}
-}
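A caveat for anyone reviving this class: the reduce in buildInitialTokenVocabulary ignores its accumulator, so allTokenFrequencies only ever holds the counts of the last mail. A minimal sketch of an accumulating count, kept as a standalone helper rather than the removed method (the function name is an assumption for illustration):

// Sketch only: accumulate token counts across all mails instead of
// keeping just the counts of the last mail. Hypothetical helper, not part of this diff.
function countTokensAcrossMails(tokenizedMails: ReadonlyArray<ReadonlyArray<string>>): Map<string, number> {
	const frequencies = new Map<string, number>()
	for (const tokenizedMail of tokenizedMails) {
		for (const token of tokenizedMail) {
			frequencies.set(token, (frequencies.get(token) ?? 0) + 1)
		}
	}
	return frequencies
}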
@@ -1,6 +1,5 @@
 import { assertWorkerOrNode } from "../../../common/api/common/Env"
 import { assertNotNull, defer, groupByAndMap, isNotNull, Nullable, promiseMap, tokenize } from "@tutao/tutanota-utils"
-import { DynamicTfVectorizer } from "./DynamicTfVectorizer"
 import { HashingVectorizer } from "./HashingVectorizer"
 import {
 	ML_BITCOIN_REGEX,
@@ -110,7 +109,7 @@ export class SpamClassifier {
 		private readonly initializer: SpamClassificationInitializer,
 		private readonly deterministic: boolean = false,
 		private readonly preprocessConfiguration: PreprocessConfiguration = DEFAULT_PREPROCESS_CONFIGURATION,
-		private readonly vectorizer: DynamicTfVectorizer | HashingVectorizer = new HashingVectorizer(),
+		private readonly vectorizer: HashingVectorizer = new HashingVectorizer(),
 	) {
 		this.classifier = new Map()
 	}
@@ -228,9 +227,6 @@ export class SpamClassifier {
 		const preprocessingTime = performance.now() - preprocessingStart
 
 		const vectorizationStart = performance.now()
-		if (this.vectorizer instanceof DynamicTfVectorizer) {
-			this.vectorizer.buildInitialTokenVocabulary(tokenizedMails)
-		}
 
 		const vectors = await this.vectorizer.transform(tokenizedMails)
 		const labels = mails.map((mail) => (mail.isSpam ? 1 : 0))
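After this change the classifier always vectorizes with HashingVectorizer, whose implementation is not part of this diff. For orientation, a minimal sketch of the hashing-trick idea such a vectorizer typically follows; the class name, dimension, hash function, and method name here are assumptions, not the actual Tuta implementation:

// Sketch of a fixed-dimension hashing vectorizer (assumed shape, not the real HashingVectorizer).
class SimpleHashingVectorizer {
	constructor(readonly dimension: number = 1024) {}

	// FNV-1a style string hash; any stable hash works for the hashing trick.
	private hash(token: string): number {
		let h = 2166136261
		for (let i = 0; i < token.length; i++) {
			h ^= token.charCodeAt(i)
			h = Math.imul(h, 16777619)
		}
		return (h >>> 0) % this.dimension
	}

	vectorize(tokens: ReadonlyArray<string>): number[] {
		const vector = new Array<number>(this.dimension).fill(0)
		for (const token of tokens) {
			vector[this.hash(token)] += 1
		}
		return vector
	}
}

Unlike the removed DynamicTfVectorizer, this approach keeps no vocabulary at all, so the output dimension is fixed and refitting can never run out of space.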
@@ -82,7 +82,6 @@ import "./api/worker/search/SuggestionFacadeTest.js"
 import "./serviceworker/SwTest.js"
 import "./api/worker/facades/KeyVerificationFacadeTest.js"
 import "./api/worker/utils/SleepDetectorTest.js"
-import "./api/worker/utils/spamClassification/TfIdfVectorizerTest.js"
 import "./api/worker/utils/spamClassification/HashingVectorizerTest.js"
 import "./api/worker/utils/spamClassification/PreprocessPatternsTest.js"
 import "./calendar/AlarmSchedulerTest.js"
@@ -1,71 +0,0 @@
-import o from "@tutao/otest"
-import { DynamicTfVectorizer } from "../../../../../../src/mail-app/workerUtils/spamClassification/DynamicTfVectorizer"
-
-o.spec("DynamicTfVectorizer", () => {
-	const tokenize = (text: string): string[] =>
-		text
-			.toLowerCase()
-			.split(/\s+/)
-			.map((t) => t.replace(/[^a-z0-9-]/gi, "")) // remove punctuation
-			.filter((t) => t.length > 1)
-
-	const rawDocuments = [
-		"Tuta is an encrypted email service that prioritizes privacy and open-source principles.",
-		"With Tuta, your emails and contacts are stored securely using end-to-end encryption.",
-		"With Tuta, you can create a completely encrypted zero-knowledge calendar, try now!",
-		"Unlike traditional email providers, Tuta never collects user data or scans your messages.",
-		"Millions of people choose Tuta to protect their personal and professional communication.",
-	]
-
-	const tokenizedDocuments = rawDocuments.map(tokenize)
-
-	o("constructor throws if docIds and documents mismatch", () => {
-		// o(() => new DynamicTfVectorizer(["doc1"], [["token1"], ["token2"]])).throws(Error)
-	})
-
-	// o("builds correct vocabulary with filtered tokens", () => {
-	// 	const vectorizer = new DynamicTfVectorizer()
-	// 	vectorizer.initializeVocabulary(tokenizedDocuments)
-	// 	o(vectorizer.vocabulary.includes("tuta")).equals(true)
-	// 	o(vectorizer.vocabulary.includes("email")).equals(true)
-	// 	o(vectorizer.vocabulary.includes("a")).equals(false)
-	// })
-
-	// o("vectorize returns correct TF vector", () => {
-	// 	const vectorizer = new DynamicTfVectorizer()
-	// 	vectorizer.initializeVocabulary(tokenizedDocuments)
-	// 	const tokens = ["email", "encryption"]
-	// 	const vector = vectorizer.vectorize(tokens)
-	// 	o(vector.length).equals(vectorizer.featureVectorDimension)
-	//
-	// 	const emailIndex = vectorizer.vocabulary.includes("email")!
-	// 	const encryptionIndex = vectorizer.vocabulary.includes("encryption")!
-	// 	o(emailIndex).equals(true)
-	// 	o(encryptionIndex).equals(true)
-	// })
-
-	// o("transform returns correct tensor shape", () => {
-	// 	const vectorizer = new DynamicTfVectorizer()
-	// 	vectorizer.initializeVocabulary(tokenizedDocuments)
-	// 	const inputTokens = [
-	// 		["privacy", "encryption"],
-	// 		["user", "data"],
-	// 	]
-	// 	const vector = vectorizer.transform(inputTokens)
-	//
-	// 	o(vector.length).equals(2)
-	// 	o(vector[0].length).equals(vectorizer.featureVectorDimension)
-	//
-	// 	const allZeros = Array.from(vector.flat()).every((v) => v === 0)
-	// 	o(allZeros).equals(false)
-	// })
-
-	// o("adds unknown words to vocabulary when still enough space", () => {
-	// 	const vectorizer = new DynamicTfVectorizer()
-	// 	vectorizer.initializeVocabulary(tokenizedDocuments)
-	// 	const tokens = ["hannover", "munich"]
-	// 	const vector = vectorizer.vectorize(tokens)
-	// 	const nonZero = vector.some((val) => val > 0)
-	// 	o(nonZero).equals(true)
-	// })
-})