mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
Use normalizeUrl so that URLs differing only in the order of their query arguments are treated as the same URL
This commit is contained in:
parent
805e2dceaa
commit
c91ccc5148
3 changed files with 9 additions and 1 deletions
|
|
@ -30,6 +30,7 @@
|
|||
"js-levenshtein": "^1.1.6",
|
||||
"js-yaml": "^4.1.0",
|
||||
"minio": "^7.1.3",
|
||||
"normalize-url": "^8.1.0",
|
||||
"p-queue": "^7.3.4",
|
||||
"pixelmatch": "^5.3.0",
|
||||
"pngjs": "^7.0.0",
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ import {
|
|||
import { ScopedSeed } from "./seeds.js";
|
||||
import { Frame } from "puppeteer-core";
|
||||
import { interpolateFilename, UploadResult } from "./storage.js";
|
||||
import normalizeUrl from "normalize-url";
|
||||
|
||||
// ============================================================================
|
||||
export enum LoadState {
|
||||
|
|
@ -675,7 +676,6 @@ return inx;
|
|||
return res >= 3;
|
||||
}
|
||||
|
||||
//async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
|
||||
async addToQueue(
|
||||
{
|
||||
url,
|
||||
|
|
@ -687,6 +687,7 @@ return inx;
|
|||
}: QueueEntry,
|
||||
limit = 0,
|
||||
) {
|
||||
url = normalizeUrl(url);
|
||||
const added = this._timestamp();
|
||||
const data: QueueEntry = { added, url, seedId, depth, extraHops };
|
||||
|
||||
|
|
@ -1012,6 +1013,7 @@ return inx;
|
|||
}
|
||||
|
||||
async addIfNoDupe(key: string, url: string, status: number) {
|
||||
url = normalizeUrl(url);
|
||||
return (
|
||||
(await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
|
||||
1
|
||||
|
|
|
|||
|
|
@ -4150,6 +4150,11 @@ normalize-path@^3.0.0:
|
|||
resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65"
|
||||
integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==
|
||||
|
||||
normalize-url@^8.1.0:
|
||||
version "8.1.0"
|
||||
resolved "https://registry.yarnpkg.com/normalize-url/-/normalize-url-8.1.0.tgz#d33504f67970decf612946fd4880bc8c0983486d"
|
||||
integrity sha512-X06Mfd/5aKsRHc0O0J5CUedwnPmnDtLF2+nq+KN9KSDlJHkPuh0JUviWjEWMe0SW/9TDdSLVPuk7L5gGTIA1/w==
|
||||
|
||||
npm-run-path@^4.0.1:
|
||||
version "4.0.1"
|
||||
resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue