seed urls list: check for quoted URLs and remove quotes (#883)

- check for URLs that are wrapped in quotes, e.g. 'https://example.com/'
or "https://example.com/", then trim and remove the quotes before adding the seed
- tests: add quoted URL to tests, fix old.webrecorder.net test
- deps: update wabac.js, RWP to latest
- logging: reduce error logging for seed lists, only log once that there are duplicates or page limit is reached
- fix for #882
This commit is contained in:
Ilya Kreymer 2025-09-12 13:34:41 -07:00 committed by GitHub
parent 705bc0cd9f
commit a2742df328
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 68 additions and 33 deletions

View file

@ -39,7 +39,7 @@ ADD config/ /app/
ADD html/ /app/html/
ARG RWP_VERSION=2.3.15
ARG RWP_VERSION=2.3.17
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz

View file

@ -18,7 +18,7 @@
"dependencies": {
"@novnc/novnc": "1.4.0",
"@puppeteer/replay": "^3.1.1",
"@webrecorder/wabac": "^2.23.8",
"@webrecorder/wabac": "^2.23.11",
"browsertrix-behaviors": "^0.9.2",
"client-zip": "^2.4.5",
"css-selector-parser": "^3.0.5",
@ -39,7 +39,7 @@
"tsc": "^2.0.4",
"undici": "^6.18.2",
"uuid": "8.3.2",
"warcio": "^2.4.4",
"warcio": "^2.4.5",
"ws": "^7.4.4",
"yargs": "^17.7.2"
},
@ -71,7 +71,7 @@
},
"resolutions": {
"wrap-ansi": "7.0.0",
"warcio": "^2.4.4",
"warcio": "^2.4.5",
"@novnc/novnc": "1.4.0"
}
}

View file

@ -129,6 +129,8 @@ export class Crawler {
limitHit = false;
pageLimit: number;
dupeSeedsFound = false;
saveStateFiles: string[] = [];
lastSaveTime: number;
@ -2465,30 +2467,34 @@ self.__bx_behaviors.selectMainBehavior();
this.pageLimit,
);
const logContext = depth === 0 ? "scope" : "links";
const logLevel = depth === 0 ? "error" : "debug";
switch (result) {
case QueueState.ADDED:
logger.debug("Queued new page URL", { url, ...logDetails }, logContext);
logger.debug("Queued new page URL", { url, ...logDetails }, "links");
return true;
case QueueState.LIMIT_HIT:
logger.logAsJSON(
logger.debug(
"Page URL not queued, at page limit",
{ url, ...logDetails },
logContext,
logLevel,
"links",
);
if (!this.limitHit && depth === 0) {
logger.error(
"Page limit reached when adding URL list, some URLs not crawled.",
);
}
this.limitHit = true;
return false;
case QueueState.DUPE_URL:
logger.logAsJSON(
if (!this.dupeSeedsFound && depth === 0) {
logger.error("Duplicate seed URLs found and skipped");
this.dupeSeedsFound = true;
}
logger.debug(
"Page URL not queued, already seen",
{ url, ...logDetails },
logContext,
logLevel,
"links",
);
return false;
}

View file

@ -342,6 +342,7 @@ export async function parseSeeds(params: CrawlerArgs): Promise<ScopedSeed[]> {
for (const seed of seeds) {
const newSeed = typeof seed === "string" ? { url: seed } : seed;
newSeed.url = removeQuotes(newSeed.url);
try {
scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
@ -389,3 +390,14 @@ export function parseRx(
return value.map((e) => (e instanceof RegExp ? e : new RegExp(e)));
}
}
/**
 * Normalizes a seed URL string that may have been wrapped in quotes.
 *
 * Surrounding whitespace is trimmed first. If the remaining text starts and
 * ends with the same quote character (`"` or `'`), that pair is removed and
 * the inner text is trimmed again, so `" https://example.com/ "` yields
 * `https://example.com/`.
 *
 * Mismatched quotes (e.g. `"https://example.com/'`) and a lone quote
 * character are left untouched so the caller still sees the original,
 * invalid input.
 *
 * @param url - Raw URL text, e.g. one line of a seed list.
 * @returns The trimmed URL with one matching pair of wrapping quotes removed.
 */
export function removeQuotes(url: string): string {
  const trimmed = url.trim();

  // A single character cannot be both the opening and the closing quote.
  if (trimmed.length < 2) {
    return trimmed;
  }

  const first = trimmed[0];
  const last = trimmed[trimmed.length - 1];
  const isQuote = first === `"` || first === `'`;

  if (isQuote && first === last) {
    // Trim again: whitespace may have been enclosed by the quotes.
    return trimmed.slice(1, -1).trim();
  }

  return trimmed;
}

View file

@ -1,2 +1,3 @@
https://webrecorder.net/about/
https://old.webrecorder.net/about/
https://specs.webrecorder.net/wacz/1.1.1/
"https://old.webrecorder.net/faq"

View file

@ -118,9 +118,9 @@ function validateResourcesIndex(json) {
{ status: 200, mime: "text/css", type: "stylesheet" },
"https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap":
{ status: 200, mime: "text/css", type: "stylesheet" },
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
{ status: 200, mime: "font/woff2", type: "font" },
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
{ status: 200, mime: "font/woff2", type: "font" },
"https://old.webrecorder.net/assets/favicon.ico": {
status: 200,
@ -161,9 +161,9 @@ function validateResourcesAbout(json) {
mime: "image/svg+xml",
type: "image",
},
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
{ status: 200, mime: "font/woff2", type: "font" },
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
{ status: 200, mime: "font/woff2", type: "font" },
});
}

View file

@ -1,13 +1,30 @@
import util from "util";
import { exec as execCallback } from "child_process";
import { spawn, exec as execCallback } from "child_process";
import fs from "fs";
const exec = util.promisify(execCallback);
let proc = null;
const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
// Launch http-server on port 31502 (the port used by TEST_HOST) with
// tests/fixtures/ as its working directory, before any test runs.
beforeAll(() => {
  const serverArgs = ["-p", "31502"];
  const spawnOpts = { cwd: "tests/fixtures/" };
  proc = spawn("../../node_modules/.bin/http-server", serverArgs, spawnOpts);
});
// Stop the spawned http-server process once all tests have finished.
afterAll(() => {
  // Nothing to clean up if no process handle was stored.
  if (!proc) {
    return;
  }
  proc.kill();
});
test("check that URLs in seed-list are crawled", async () => {
try {
await exec(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000 --scopeType page",
);
} catch (error) {
console.log(error);
@ -43,7 +60,7 @@ test("check that URLs in seed-list are crawled", async () => {
test("check that URLs in seed-list hosted at URL are crawled", async () => {
try {
await exec(
'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000',
`docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "${TEST_HOST}/urlSeedFile.txt" --timeout 90000 --scopeType page`,
);
} catch (error) {
console.log(error);

View file

@ -1134,10 +1134,10 @@
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
"@webrecorder/wabac@^2.23.8":
version "2.23.8"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.8.tgz#a3eb1e605acb706b6f043ec9e7fae9ff412ccc8a"
integrity sha512-+ShHsaBHwFC0SPFTpMWrwJHd47MzT6o1Rg12FSfGfpycrcmrBV447+JR28NitLJIsfcIif8xAth9Vh5Z7tHWlQ==
"@webrecorder/wabac@^2.23.11":
version "2.23.11"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.11.tgz#945da06e08b6d093b525e6e5bfd6a8f17beb995b"
integrity sha512-rsBAkcYvgX+0HgwhgvSb3cBCBp0rVnHGQS/K5A9aJwOmfymHt0C2vInH/lmKV/5H38rJu29c2cvRX962h+lUiw==
dependencies:
"@peculiar/asn1-ecc" "^2.3.4"
"@peculiar/asn1-schema" "^2.3.3"
@ -1151,7 +1151,6 @@
buffer "^6.0.3"
fast-xml-parser "^4.4.1"
hash-wasm "^4.9.0"
http-link-header "^1.1.3"
http-status-codes "^2.1.4"
idb "^7.1.1"
js-levenshtein "^1.1.6"
@ -1162,7 +1161,7 @@
path-parser "^6.1.0"
process "^0.11.10"
stream-browserify "^3.0.0"
warcio "^2.4.3"
warcio "^2.4.5"
"@webrecorder/wombat@^3.8.14":
version "3.8.14"
@ -2834,7 +2833,7 @@ html-escaper@^2.0.0:
resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453"
integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==
http-link-header@^1.1.1, http-link-header@^1.1.3:
http-link-header@^1.1.1:
version "1.1.3"
resolved "https://registry.yarnpkg.com/http-link-header/-/http-link-header-1.1.3.tgz#b367b7a0ad1cf14027953f31aa1df40bb433da2a"
integrity sha512-3cZ0SRL8fb9MUlU3mKM61FcQvPfXx2dBrZW3Vbg5CXa8jFlK8OaEpePenLe1oEXQduhz8b0QjsqfS59QP4AJDQ==
@ -5527,10 +5526,10 @@ walker@^1.0.8:
dependencies:
makeerror "1.0.12"
warcio@^2.4.0, warcio@^2.4.3, warcio@^2.4.4:
version "2.4.4"
resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.4.tgz#6c0c030bb55c0f0b824f854fa9e6718ca25d333d"
integrity sha512-FrWOhv1qLNhPBPGEMm24Yo+DtkipK5DxK3ckVGbOf0OJ/UqaxAhiiby74q+GW70dsJV0wF+RA1ToK6CKseTshA==
warcio@^2.4.0, warcio@^2.4.5:
version "2.4.5"
resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.5.tgz#ba39c38e433491ab9016282813b9cf6539c3d808"
integrity sha512-b6R/aIsR4fXzrpY/Zud7LqHFi2Bt8Ov5VLOnruHQ10rk129e9d0KOCZlyRmPD6ENTcV7yze5rXvJ5WSNS8R1zw==
dependencies:
"@types/pako" "^1.0.7"
"@types/stream-buffers" "^3.0.7"