mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
Merge branch 'add-normalize-url' into temp-dev
This commit is contained in:
commit
aff3179a3a
1 changed file with 17 additions and 3 deletions
|
|
@@ -11,7 +11,7 @@ import {
|
||||||
import { ScopedSeed } from "./seeds.js";
|
import { ScopedSeed } from "./seeds.js";
|
||||||
import { Frame } from "puppeteer-core";
|
import { Frame } from "puppeteer-core";
|
||||||
import { interpolateFilename, UploadResult } from "./storage.js";
|
import { interpolateFilename, UploadResult } from "./storage.js";
|
||||||
import normalizeUrl from "normalize-url";
|
import normalizeUrl, { Options } from "normalize-url";
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
export enum LoadState {
|
export enum LoadState {
|
||||||
|
|
@@ -29,6 +29,20 @@ export enum QueueState {
|
||||||
DUPE_URL = 2,
|
DUPE_URL = 2,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
const normalizeOpts: Options = {
|
||||||
|
defaultProtocol: "https",
|
||||||
|
stripAuthentication: false,
|
||||||
|
stripTextFragment: false,
|
||||||
|
stripWWW: false,
|
||||||
|
stripHash: false,
|
||||||
|
removeTrailingSlash: false,
|
||||||
|
removeSingleSlash: false,
|
||||||
|
removeExplicitPort: true,
|
||||||
|
sortQueryParameters: true,
|
||||||
|
removePath: false,
|
||||||
|
};
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// treat 0 or 206 as 200 for purposes of dedup
|
// treat 0 or 206 as 200 for purposes of dedup
|
||||||
function normalizeDedupStatus(status: number): number {
|
function normalizeDedupStatus(status: number): number {
|
||||||
|
|
@@ -687,7 +701,7 @@ return inx;
|
||||||
}: QueueEntry,
|
}: QueueEntry,
|
||||||
limit = 0,
|
limit = 0,
|
||||||
) {
|
) {
|
||||||
url = normalizeUrl(url);
|
url = normalizeUrl(url, normalizeOpts);
|
||||||
const added = this._timestamp();
|
const added = this._timestamp();
|
||||||
const data: QueueEntry = { added, url, seedId, depth, extraHops };
|
const data: QueueEntry = { added, url, seedId, depth, extraHops };
|
||||||
|
|
||||||
|
|
@@ -1013,7 +1027,7 @@ return inx;
|
||||||
}
|
}
|
||||||
|
|
||||||
async addIfNoDupe(key: string, url: string, status: number) {
|
async addIfNoDupe(key: string, url: string, status: number) {
|
||||||
url = normalizeUrl(url);
|
url = normalizeUrl(url, normalizeOpts);
|
||||||
return (
|
return (
|
||||||
(await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
|
(await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
|
||||||
1
|
1
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue