mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-07 13:49:47 +00:00
Merge branch 'add-normalize-url' into temp-dev
This commit is contained in:
commit
aff3179a3a
1 changed file with 17 additions and 3 deletions
|
|
@ -11,7 +11,7 @@ import {
|
|||
import { ScopedSeed } from "./seeds.js";
|
||||
import { Frame } from "puppeteer-core";
|
||||
import { interpolateFilename, UploadResult } from "./storage.js";
|
||||
import normalizeUrl from "normalize-url";
|
||||
import normalizeUrl, { Options } from "normalize-url";
|
||||
|
||||
// ============================================================================
|
||||
export enum LoadState {
|
||||
|
|
@ -29,6 +29,20 @@ export enum QueueState {
|
|||
DUPE_URL = 2,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Options handed to the normalizeUrl() calls below, so a URL is normalized the
// same way when it is queued and when it is recorded for dedup.
// Anything that would strip part of the URL (credentials, fragment, "www.",
// trailing slash, path, port) is disabled; only query-parameter ordering is
// canonicalized.
const normalizeOpts: Options = {
  defaultProtocol: "https",
  stripAuthentication: false,
  stripTextFragment: false,
  stripWWW: false,
  stripHash: false,
  removeTrailingSlash: false,
  removeSingleSlash: false,
  // Must stay false: per the normalize-url docs, `true` drops ANY explicit port
  // (e.g. ":8080"), not just the protocol default, so the normalized URL would
  // refer to a different origin. Default ports (80 for http, 443 for https) are
  // removed by the library regardless of this flag.
  removeExplicitPort: false,
  sortQueryParameters: true,
  removePath: false,
};
|
||||
|
||||
// ============================================================================
|
||||
// treat 0 or 206 as 200 for purposes of dedup
|
||||
function normalizeDedupStatus(status: number): number {
|
||||
|
|
@ -687,7 +701,7 @@ return inx;
|
|||
}: QueueEntry,
|
||||
limit = 0,
|
||||
) {
|
||||
url = normalizeUrl(url);
|
||||
url = normalizeUrl(url, normalizeOpts);
|
||||
const added = this._timestamp();
|
||||
const data: QueueEntry = { added, url, seedId, depth, extraHops };
|
||||
|
||||
|
|
@ -1013,7 +1027,7 @@ return inx;
|
|||
}
|
||||
|
||||
async addIfNoDupe(key: string, url: string, status: number) {
|
||||
url = normalizeUrl(url);
|
||||
url = normalizeUrl(url, normalizeOpts);
|
||||
return (
|
||||
(await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
|
||||
1
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue