Implement a local machine learning model for client-side spam filtering.

The local model is implemented as a TensorFlow "LayersModel"; a separate model is trained for each available mailbox, resulting in one model per ownerGroup (i.e. mailbox). The training data is initially aggregated from the last 30 days of received mails and stored in a separate offline database table named spam_classification_training_data; the trained model is stored in the table spam_classification_model. Initial training starts after indexing, with periodic retraining every 30 minutes and on each subsequent login.

The model predicts on each incoming mail once the entity event for that mail has been received, moving it to either the inbox or the spam folder. When users move mails, the training data labels are updated accordingly by adjusting the isSpam classification and isSpamConfidence values in the offline database. The MoveMailService now carries a moveReason, which indicates that a mail was moved by our spam filter.

Client-side spam filtering can be activated via the SpamClientClassification feature flag and is currently only available on the desktop client.

Co-authored-by: sug <sug@tutao.de>
Co-authored-by: kib <104761667+kibibytium@users.noreply.github.com>
Co-authored-by: abp <abp@tutao.de>
Co-authored-by: map <mpfau@users.noreply.github.com>
Co-authored-by: jhm <17314077+jomapp@users.noreply.github.com>
Co-authored-by: frm <frm@tutao.de>
Co-authored-by: das <das@tutao.de>
Co-authored-by: nif <nif@tutao.de>
Co-authored-by: amm <amm@tutao.de>
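For context, a minimal sketch of what one such per-mailbox binary classifier could look like when built on a TensorFlow.js LayersModel. The function names, layer sizes, 1024-dimensional input, and training parameters below are illustrative assumptions, not the actual Tuta implementation.

// Sketch only: a tiny dense network that outputs a spam probability in [0, 1].
// The architecture and hyperparameters are assumptions for illustration.
import * as tf from "@tensorflow/tfjs"

const INPUT_DIMENSION = 1024 // assumed size of the hashed feature vector

function buildSpamModel(): tf.LayersModel {
	const model = tf.sequential()
	model.add(tf.layers.dense({ inputShape: [INPUT_DIMENSION], units: 16, activation: "relu" }))
	model.add(tf.layers.dense({ units: 1, activation: "sigmoid" }))
	model.compile({ optimizer: "adam", loss: "binaryCrossentropy", metrics: ["accuracy"] })
	return model
}

// Trains on vectorized mails (one row per mail) with labels 0 = ham, 1 = spam.
async function trainSpamModel(model: tf.LayersModel, vectors: number[][], labels: number[]): Promise<void> {
	const xs = tf.tensor2d(vectors)
	const ys = tf.tensor2d(labels, [labels.length, 1])
	try {
		await model.fit(xs, ys, { epochs: 5, shuffle: true })
	} finally {
		xs.dispose()
		ys.dispose()
	}
}

At prediction time, model.predict on a single hashed mail vector yields the spam probability that would drive the inbox-or-spam decision described above.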
import o from "@tutao/otest"
import { HashingVectorizer } from "../../../../../../src/mail-app/workerUtils/spamClassification/HashingVectorizer"
import { arrayEquals } from "@tutao/tutanota-utils"

// Lowercases, splits on whitespace, and drops single-character tokens.
export const tokenize = (text: string): string[] =>
	text
		.toLowerCase()
		.split(/\s+/)
		.filter((t) => t.length > 1)

o.spec("HashingVectorizer", () => {
	const rawDocuments = [
		"Tuta is an encrypted email service that prioritizes privacy and open-source principles.",
		"With Tuta, your emails and contacts are stored securely using end-to-end encryption.",
		"With Tuta, you can create a completely encrypted zero-knowledge calendar, try now!",
		"Unlike traditional email providers, Tuta never collects user data or scans your messages.",
		"Millions of people choose Tuta to protect their personal and professional communication.",
	]

	const tokenizedDocuments = rawDocuments.map(tokenize)

	// Hashing is deterministic, so identical token lists must map to identical vectors.
	o("vectorize creates same vector for same tokens", async () => {
		const vectorizer = new HashingVectorizer()
		const tokens = ["privacy", "email", "data"]
		const v1 = await vectorizer.vectorize(tokens)
		const v2 = await vectorizer.vectorize(tokens)
		o(arrayEquals(v1, v2)).equals(true)
	})

	// An empty token list should produce an all-zero vector.
	o("vectorize handles empty input", async () => {
		const vectorizer = new HashingVectorizer()
		const vector = await vectorizer.vectorize([])
		o(vector.every((v) => v === 0)).equals(true)
	})

	// transform() should return one vector per document, each of the configured dimension.
	o("transform returns correct shape", async () => {
		const vectorizer = new HashingVectorizer()
		const tensor = await vectorizer.transform(tokenizedDocuments)

		o(tensor.length).equals(tokenizedDocuments.length)
		for (const vec of tensor) {
			o(vec.length).equals(vectorizer.dimension)
		}
	})
})
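For reference, a minimal sketch of a vectorizer satisfying the behaviour these tests pin down: deterministic output for the same tokens, an all-zero vector for empty input, and one row per document of length dimension. It uses the feature-hashing trick with a 32-bit FNV-1a hash; the hash choice, the class name, and the 1024 default dimension are assumptions, not the real HashingVectorizer.

// Sketch only: maps token lists onto fixed-length count vectors via the hashing trick.
export class HashingVectorizerSketch {
	constructor(readonly dimension: number = 1024) {}

	// Each token increments the bucket its hash falls into, so output is
	// deterministic and an empty input leaves the vector all zeros.
	async vectorize(tokens: string[]): Promise<number[]> {
		const vector = new Array(this.dimension).fill(0)
		for (const token of tokens) {
			vector[this.hash(token) % this.dimension] += 1
		}
		return vector
	}

	// Vectorizes each tokenized document, yielding a documents x dimension matrix.
	async transform(tokenizedDocuments: string[][]): Promise<number[][]> {
		return Promise.all(tokenizedDocuments.map((tokens) => this.vectorize(tokens)))
	}

	// 32-bit FNV-1a string hash; any stable hash function works for this trick.
	private hash(token: string): number {
		let h = 0x811c9dc5
		for (let i = 0; i < token.length; i++) {
			h ^= token.charCodeAt(i)
			h = Math.imul(h, 0x01000193)
		}
		return h >>> 0
	}
}

Because the vector length is fixed by hashing rather than by a learned vocabulary, the vectorizer needs no fitting step, which suits per-mailbox training on a client where the vocabulary changes as mail arrives.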