Mirror of https://github.com/tutao/tutanota.git (synced 2025-12-08 06:09:50 +00:00)
remove DynamicTfVectorizer

Co-authored-by: map <mpfau@users.noreply.github.com>

parent 6d4a656a69, commit 5124985d4f
4 changed files with 1 addition and 189 deletions
@@ -1,112 +0,0 @@
-const DEFAULT_TOKEN_VOCABULARY_LIMIT = 5000
-const DEFAULT_USE_TOKEN_STEMMING = true
-
-export type Stats = {
-	/// This is the lowest frequency of word found in current vocabulary,
-	/// which means all included vocabulary item have frequency higher than or equal to this frequency.
-	lowestIncludedFrequency: number
-	/// Out of total corpus of token vocabulary, these many items were excluded from vocabulary,
-	excludedTokenVocabularyCount: number
-}
-
-export class DynamicTfVectorizer {
-	readonly dimension: number
-
-	private stats: Stats | null = null
-
-	public constructor(
-		tokenVocabulary: Set<string>,
-		readonly useTokenStemming: boolean = DEFAULT_USE_TOKEN_STEMMING,
-		readonly tokenVocabularyLimit: number = DEFAULT_TOKEN_VOCABULARY_LIMIT,
-	) {
-		this.tokenVocabulary = tokenVocabulary
-		// we account for 50% more vocabulary than initially occupied
-		this.dimension = tokenVocabularyLimit + tokenVocabularyLimit * 0.5
-	}
-
-	private tokenVocabulary: Set<string>
-
-	public buildInitialTokenVocabulary(initialTokenizedMails: ReadonlyArray<ReadonlyArray<string>>) {
-		console.log(initialTokenizedMails)
-
-		const allTokenFrequencies = initialTokenizedMails.reduce((_, tokenizedMail) => this.getTokenFrequency(tokenizedMail), new Map<string, number>())
-
-		console.log(allTokenFrequencies)
-
-		const mostCommonTokens = Array.from(allTokenFrequencies.entries())
-			.sort((a, b) => b[1] - a[1])
-			.slice(0, this.tokenVocabularyLimit)
-
-		const lowestIncludedFrequency = mostCommonTokens[mostCommonTokens.length - 1][1]
-		const excludedTokenVocabularyCount = allTokenFrequencies.size - mostCommonTokens.length
-
-		console.log(mostCommonTokens)
-
-		this.tokenVocabulary = new Set(mostCommonTokens.map(([token, _frequency]) => token))
-		console.log(this.tokenVocabulary)
-		this.stats = { lowestIncludedFrequency, excludedTokenVocabularyCount }
-	}
-
-	private getTokenFrequency(tokenCollection: ReadonlyArray<string>, expandTokenVocabulary = false) {
-		const resultTokenFrequencyMap = new Map<string, number>()
-		for (let token of tokenCollection) {
-			if (this.useTokenStemming) {
-				//token = stemmer(token)
-			}
-			if (expandTokenVocabulary && !this.tokenVocabulary.has(token)) {
-				this.expandTokenVocabulary(token)
-			}
-
-			resultTokenFrequencyMap.set(token, (resultTokenFrequencyMap.get(token) || 0) + 1)
-		}
-		return resultTokenFrequencyMap
-	}
-
-	/**
-	 * Expand (add to) the token vocabulary with the new token.
-	 */
-	private expandTokenVocabulary(token: string) {
-		this.tokenVocabulary.add(token)
-	}
-
-	public transform(tokenizedMails: Array<ReadonlyArray<string>>): number[][] {
-		return this._transform(tokenizedMails, false)
-	}
-
-	/**
-	 * transform method to be used when refitting
-	 * @returns: null in case a full retraining of the model is required
-	 */
-	public refitTransform(tokenizedMails: Array<ReadonlyArray<string>>): number[][] | null {
-		const transformResult = this._transform(tokenizedMails, true)
-
-		const availableSpace = this.dimension - this.tokenVocabulary.size
-		if (availableSpace <= 0) {
-			return null
-		} else {
-			return transformResult
-		}
-	}
-
-	private _transform(tokenizedMails: Array<ReadonlyArray<string>>, expandTokenVocabulary = false): number[][] {
-		return tokenizedMails.map((tokenizedMail) => this.vectorize(tokenizedMail, expandTokenVocabulary))
-	}
-
-	// visibleForTesting
-	public vectorize(tokenizedMail: ReadonlyArray<string>, expandTokenVocabulary = false): number[] {
-		const tokenFrequencyMap = this.getTokenFrequency(tokenizedMail, expandTokenVocabulary)
-
-		let index = 0
-		let vector = new Array<number>(this.dimension).fill(0)
-		for (const [token, _] of this.tokenVocabulary.entries()) {
-			vector[index] = tokenFrequencyMap.get(token) ?? 0
-			index += 1
-		}
-
-		return vector
-	}
-
-	public getStats(): Stats | null {
-		return Object.seal(this.stats)
-	}
-}
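For reference, the vectorizer deleted above built a capped vocabulary from the most frequent tokens of an initial mail corpus and mapped each tokenized mail onto a fixed-size term-frequency vector. A minimal usage sketch against the public API as it existed before this commit; the seed vocabulary and token arrays below are invented example data, not taken from the codebase:

```ts
// Illustrative only: how the (now removed) DynamicTfVectorizer was meant to be driven.
// import { DynamicTfVectorizer } from "./DynamicTfVectorizer" // file deleted by this commit

// Seed vocabulary and tokenized mails are made-up example data.
const seedVocabulary = new Set<string>(["tuta", "encrypted", "email"])
const vectorizer = new DynamicTfVectorizer(seedVocabulary)

// Build the working vocabulary from the most frequent tokens of the initial corpus...
vectorizer.buildInitialTokenVocabulary([
	["tuta", "encrypted", "email", "service"],
	["spam", "offer", "bitcoin"],
])

// ...then map each tokenized mail onto a fixed-size term-frequency vector.
const vectors: number[][] = vectorizer.transform([["encrypted", "email", "email"]])

// During refitting, unknown tokens are added to the vocabulary; a null result
// signals the vocabulary outgrew `dimension` and a full retrain is required.
const refitted: number[][] | null = vectorizer.refitTransform([["new", "tokens"]])
```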
@@ -1,6 +1,5 @@
 import { assertWorkerOrNode } from "../../../common/api/common/Env"
 import { assertNotNull, defer, groupByAndMap, isNotNull, Nullable, promiseMap, tokenize } from "@tutao/tutanota-utils"
-import { DynamicTfVectorizer } from "./DynamicTfVectorizer"
 import { HashingVectorizer } from "./HashingVectorizer"
 import {
 	ML_BITCOIN_REGEX,
@@ -110,7 +109,7 @@ export class SpamClassifier {
 		private readonly initializer: SpamClassificationInitializer,
 		private readonly deterministic: boolean = false,
 		private readonly preprocessConfiguration: PreprocessConfiguration = DEFAULT_PREPROCESS_CONFIGURATION,
-		private readonly vectorizer: DynamicTfVectorizer | HashingVectorizer = new HashingVectorizer(),
+		private readonly vectorizer: HashingVectorizer = new HashingVectorizer(),
 	) {
 		this.classifier = new Map()
 	}
@@ -228,9 +227,6 @@ export class SpamClassifier {
 		const preprocessingTime = performance.now() - preprocessingStart

 		const vectorizationStart = performance.now()
-		if (this.vectorizer instanceof DynamicTfVectorizer) {
-			this.vectorizer.buildInitialTokenVocabulary(tokenizedMails)
-		}

 		const vectors = await this.vectorizer.transform(tokenizedMails)
 		const labels = mails.map((mail) => (mail.isSpam ? 1 : 0))
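With the instanceof branch gone, training always vectorizes through the HashingVectorizer and no longer needs a vocabulary-building step. A rough sketch of the resulting flow, using only the calls visible in this hunk; TrainingMail and vectorizeForTraining are illustrative names, not part of the codebase:

```ts
// Rough sketch of the simplified training path after this change; not the actual
// SpamClassifier code. TrainingMail and vectorizeForTraining are illustrative names.
import { HashingVectorizer } from "./HashingVectorizer"

interface TrainingMail {
	tokens: string[]
	isSpam: boolean
}

async function vectorizeForTraining(vectorizer: HashingVectorizer, mails: ReadonlyArray<TrainingMail>) {
	const tokenizedMails = mails.map((mail) => mail.tokens)
	// No instanceof check and no buildInitialTokenVocabulary() call any more:
	// every mail goes straight through the hashing vectorizer.
	const vectors = await vectorizer.transform(tokenizedMails)
	const labels = mails.map((mail) => (mail.isSpam ? 1 : 0))
	return { vectors, labels }
}
```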
@@ -82,7 +82,6 @@ import "./api/worker/search/SuggestionFacadeTest.js"
 import "./serviceworker/SwTest.js"
 import "./api/worker/facades/KeyVerificationFacadeTest.js"
 import "./api/worker/utils/SleepDetectorTest.js"
-import "./api/worker/utils/spamClassification/TfIdfVectorizerTest.js"
 import "./api/worker/utils/spamClassification/HashingVectorizerTest.js"
 import "./api/worker/utils/spamClassification/PreprocessPatternsTest.js"
 import "./calendar/AlarmSchedulerTest.js"
@@ -1,71 +0,0 @@
-import o from "@tutao/otest"
-import { DynamicTfVectorizer } from "../../../../../../src/mail-app/workerUtils/spamClassification/DynamicTfVectorizer"
-
-o.spec("DynamicTfVectorizer", () => {
-	const tokenize = (text: string): string[] =>
-		text
-			.toLowerCase()
-			.split(/\s+/)
-			.map((t) => t.replace(/[^a-z0-9-]/gi, "")) // remove punctuation
-			.filter((t) => t.length > 1)
-
-	const rawDocuments = [
-		"Tuta is an encrypted email service that prioritizes privacy and open-source principles.",
-		"With Tuta, your emails and contacts are stored securely using end-to-end encryption.",
-		"With Tuta, you can create a completely encrypted zero-knowledge calendar, try now!",
-		"Unlike traditional email providers, Tuta never collects user data or scans your messages.",
-		"Millions of people choose Tuta to protect their personal and professional communication.",
-	]
-
-	const tokenizedDocuments = rawDocuments.map(tokenize)
-
-	o("constructor throws if docIds and documents mismatch", () => {
-		// o(() => new DynamicTfVectorizer(["doc1"], [["token1"], ["token2"]])).throws(Error)
-	})
-
-	// o("builds correct vocabulary with filtered tokens", () => {
-	// 	const vectorizer = new DynamicTfVectorizer()
-	// 	vectorizer.initializeVocabulary(tokenizedDocuments)
-	// 	o(vectorizer.vocabulary.includes("tuta")).equals(true)
-	// 	o(vectorizer.vocabulary.includes("email")).equals(true)
-	// 	o(vectorizer.vocabulary.includes("a")).equals(false)
-	// })
-
-	// o("vectorize returns correct TF vector", () => {
-	// 	const vectorizer = new DynamicTfVectorizer()
-	// 	vectorizer.initializeVocabulary(tokenizedDocuments)
-	// 	const tokens = ["email", "encryption"]
-	// 	const vector = vectorizer.vectorize(tokens)
-	// 	o(vector.length).equals(vectorizer.featureVectorDimension)
-	//
-	// 	const emailIndex = vectorizer.vocabulary.includes("email")!
-	// 	const encryptionIndex = vectorizer.vocabulary.includes("encryption")!
-	// 	o(emailIndex).equals(true)
-	// 	o(encryptionIndex).equals(true)
-	// })
-
-	// o("transform returns correct tensor shape", () => {
-	// 	const vectorizer = new DynamicTfVectorizer()
-	// 	vectorizer.initializeVocabulary(tokenizedDocuments)
-	// 	const inputTokens = [
-	// 		["privacy", "encryption"],
-	// 		["user", "data"],
-	// 	]
-	// 	const vector = vectorizer.transform(inputTokens)
-	//
-	// 	o(vector.length).equals(2)
-	// 	o(vector[0].length).equals(vectorizer.featureVectorDimension)
-	//
-	// 	const allZeros = Array.from(vector.flat()).every((v) => v === 0)
-	// 	o(allZeros).equals(false)
-	// })
-
-	// o("adds unknown words to vocabulary when still enough space", () => {
-	// 	const vectorizer = new DynamicTfVectorizer()
-	// 	vectorizer.initializeVocabulary(tokenizedDocuments)
-	// 	const tokens = ["hannover", "munich"]
-	// 	const vector = vectorizer.vectorize(tokens)
-	// 	const nonZero = vector.some((val) => val > 0)
-	// 	o(nonZero).equals(true)
-	// })
-})
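The deleted tests targeted the vocabulary-driven behaviour of DynamicTfVectorizer, where the vector layout depended on a mutable token vocabulary and running out of space signalled a full retrain. The HashingVectorizer kept by SpamClassifier avoids maintaining a vocabulary by hashing tokens directly into a fixed number of buckets. The following is a generic, standalone sketch of that idea, not Tuta's actual HashingVectorizer; hashToken and hashVectorize are illustrative helpers:

```ts
// Generic illustration of feature hashing; NOT Tuta's HashingVectorizer implementation.
function hashToken(token: string, dimension: number): number {
	// FNV-1a style 32-bit hash, reduced modulo the vector dimension.
	let hash = 2166136261
	for (let i = 0; i < token.length; i++) {
		hash ^= token.charCodeAt(i)
		hash = Math.imul(hash, 16777619)
	}
	return Math.abs(hash) % dimension
}

function hashVectorize(tokens: ReadonlyArray<string>, dimension = 1024): number[] {
	// No vocabulary to build or grow: every token maps to a bucket index directly,
	// so the vector size never changes and refitting never forces a full retrain.
	const vector = new Array<number>(dimension).fill(0)
	for (const token of tokens) {
		vector[hashToken(token, dimension)] += 1
	}
	return vector
}
```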