From 826342f001b0b74b11f281606797fe1250e98efe Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 5 Dec 2025 09:50:13 -0800
Subject: [PATCH] change opts for normalization, such as keeping www. and trailing slashes

Pass an explicit normalize-url options object when normalizing URLs for
the crawl queue and for the dedup set, so that www., hashes, text
fragments, credentials, paths and trailing slashes are kept as-is while
query parameters are sorted.

removeExplicitPort is left off: when enabled, normalize-url strips any
explicit port (e.g. example.com:8080 -> example.com), which would rewrite
queued URLs to a different origin and merge dedup keys across ports.
Default ports (80 for http, 443 for https) are dropped regardless.
---
 src/util/state.ts | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/util/state.ts b/src/util/state.ts
index fc6ca580..b3b51287 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -11,7 +11,7 @@ import {
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 import { interpolateFilename, UploadResult } from "./storage.js";
-import normalizeUrl from "normalize-url";
+import normalizeUrl, { Options } from "normalize-url";
 
 // ============================================================================
 export enum LoadState {
@@ -29,6 +29,20 @@ export enum QueueState {
   DUPE_URL = 2,
 }
 
+// ============================================================================
+const normalizeOpts: Options = {
+  defaultProtocol: "https",
+  stripAuthentication: false,
+  stripTextFragment: false,
+  stripWWW: false,
+  stripHash: false,
+  removeTrailingSlash: false,
+  removeSingleSlash: false,
+  removeExplicitPort: false, // true strips every port, e.g. :8080
+  sortQueryParameters: true,
+  removePath: false,
+};
+
 // ============================================================================
 // treat 0 or 206 as 200 for purposes of dedup
 function normalizeDedupStatus(status: number): number {
@@ -687,7 +701,7 @@ return inx;
     }: QueueEntry,
     limit = 0,
   ) {
-    url = normalizeUrl(url);
+    url = normalizeUrl(url, normalizeOpts);
     const added = this._timestamp();
     const data: QueueEntry = { added, url, seedId, depth, extraHops };
 
@@ -1013,7 +1027,7 @@ return inx;
   }
 
   async addIfNoDupe(key: string, url: string, status: number) {
-    url = normalizeUrl(url);
+    url = normalizeUrl(url, normalizeOpts);
     return (
       (await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
       1