From a2742df3282fe350ce1b82fe6a598f5dd9ccc03e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 12 Sep 2025 13:34:41 -0700 Subject: [PATCH] seed urls list: check for quoted URLs and remove quotes (#883) - check for urls that are wrapped in quotes, eg. 'https://example.com/' or "https://example.com/" and trim and remove the quotes before adding seed - tests: add quoted URL to tests, fix old.webrecorder.net test - deps: update wabac.js, RWP to latest - logging: reduce error logging for seed lists, only log once that there are duplicates or page limit is reached - fix for #882 --- Dockerfile | 2 +- package.json | 6 +++--- src/crawler.ts | 26 ++++++++++++++++---------- src/util/seeds.ts | 12 ++++++++++++ tests/fixtures/urlSeedFile.txt | 3 ++- tests/pageinfo-records.test.js | 8 ++++---- tests/url_file_list.test.js | 23 ++++++++++++++++++++--- yarn.lock | 21 ++++++++++----------- 8 files changed, 68 insertions(+), 33 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0cadfc14..ca8cb670 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,7 +39,7 @@ ADD config/ /app/ ADD html/ /app/html/ -ARG RWP_VERSION=2.3.15 +ARG RWP_VERSION=2.3.17 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz diff --git a/package.json b/package.json index 6e7da0b7..fd4c263d 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,7 @@ "dependencies": { "@novnc/novnc": "1.4.0", "@puppeteer/replay": "^3.1.1", - "@webrecorder/wabac": "^2.23.8", + "@webrecorder/wabac": "^2.23.11", "browsertrix-behaviors": "^0.9.2", "client-zip": "^2.4.5", "css-selector-parser": "^3.0.5", @@ -39,7 +39,7 @@ "tsc": "^2.0.4", "undici": "^6.18.2", "uuid": "8.3.2", - "warcio": "^2.4.4", + "warcio": "^2.4.5", "ws": "^7.4.4", "yargs": "^17.7.2" }, @@ -71,7 +71,7 @@ }, "resolutions": { "wrap-ansi": "7.0.0", - "warcio": "^2.4.4", + "warcio": "^2.4.5", "@novnc/novnc": "1.4.0" } } diff --git a/src/crawler.ts b/src/crawler.ts index 3b1805b3..39b1b2f5 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -129,6 +129,8 @@ export class Crawler { limitHit = false; pageLimit: number; + dupeSeedsFound = false; + saveStateFiles: string[] = []; lastSaveTime: number; @@ -2465,30 +2467,34 @@ self.__bx_behaviors.selectMainBehavior(); this.pageLimit, ); - const logContext = depth === 0 ? "scope" : "links"; - const logLevel = depth === 0 ? "error" : "debug"; - switch (result) { case QueueState.ADDED: - logger.debug("Queued new page URL", { url, ...logDetails }, logContext); + logger.debug("Queued new page URL", { url, ...logDetails }, "links"); return true; case QueueState.LIMIT_HIT: - logger.logAsJSON( + logger.debug( "Page URL not queued, at page limit", { url, ...logDetails }, - logContext, - logLevel, + "links", ); + if (!this.limitHit && depth === 0) { + logger.error( + "Page limit reached when adding URL list, some URLs not crawled.", + ); + } this.limitHit = true; return false; case QueueState.DUPE_URL: - logger.logAsJSON( + if (!this.dupeSeedsFound && depth === 0) { + logger.error("Duplicate seed URLs found and skipped"); + this.dupeSeedsFound = true; + } + logger.debug( "Page URL not queued, already seen", { url, ...logDetails }, - logContext, - logLevel, + "links", ); return false; } diff --git a/src/util/seeds.ts b/src/util/seeds.ts index ade15208..efdb0b8d 100644 --- a/src/util/seeds.ts +++ b/src/util/seeds.ts @@ -342,6 +342,7 @@ export async function parseSeeds(params: CrawlerArgs): Promise { for (const seed of seeds) { const newSeed = typeof seed === "string" ? { url: seed } : seed; + newSeed.url = removeQuotes(newSeed.url); try { scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed })); @@ -389,3 +390,14 @@ export function parseRx( return value.map((e) => (e instanceof RegExp ? e : new RegExp(e))); } } + +export function removeQuotes(url: string) { + url = url.trim(); + if ( + (url.startsWith(`"`) && url.endsWith(`"`)) || + (url.startsWith(`'`) && url.endsWith(`'`)) + ) { + url = url.slice(1, -1); + } + return url; +} diff --git a/tests/fixtures/urlSeedFile.txt b/tests/fixtures/urlSeedFile.txt index 5d850254..b4ebe0e6 100644 --- a/tests/fixtures/urlSeedFile.txt +++ b/tests/fixtures/urlSeedFile.txt @@ -1,2 +1,3 @@ -https://webrecorder.net/about/ +https://old.webrecorder.net/about/ https://specs.webrecorder.net/wacz/1.1.1/ +"https://old.webrecorder.net/faq" diff --git a/tests/pageinfo-records.test.js b/tests/pageinfo-records.test.js index 061dbe8c..fc6b6168 100644 --- a/tests/pageinfo-records.test.js +++ b/tests/pageinfo-records.test.js @@ -118,9 +118,9 @@ function validateResourcesIndex(json) { { status: 200, mime: "text/css", type: "stylesheet" }, "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap": { status: 200, mime: "text/css", type: "stylesheet" }, - "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": + "https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": { status: 200, mime: "font/woff2", type: "font" }, - "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": + "https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": { status: 200, mime: "font/woff2", type: "font" }, "https://old.webrecorder.net/assets/favicon.ico": { status: 200, @@ -161,9 +161,9 @@ function validateResourcesAbout(json) { mime: "image/svg+xml", type: "image", }, - "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": + "https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": { status: 200, mime: "font/woff2", type: "font" }, - "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": + "https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": { status: 200, mime: "font/woff2", type: "font" }, }); } diff --git a/tests/url_file_list.test.js b/tests/url_file_list.test.js index c76afa6e..6ceab34a 100644 --- a/tests/url_file_list.test.js +++ b/tests/url_file_list.test.js @@ -1,13 +1,30 @@ import util from "util"; -import { exec as execCallback } from "child_process"; +import { spawn, exec as execCallback } from "child_process"; import fs from "fs"; const exec = util.promisify(execCallback); +let proc = null; + +const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal"; +const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`; + +beforeAll(() => { + proc = spawn("../../node_modules/.bin/http-server", ["-p", "31502"], {cwd: "tests/fixtures/"}); +}); + +afterAll(() => { + if (proc) { + proc.kill(); + } +}); + + + test("check that URLs in seed-list are crawled", async () => { try { await exec( - "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000", + "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000 --scopeType page", ); } catch (error) { console.log(error); @@ -43,7 +60,7 @@ test("check that URLs in seed-list are crawled", async () => { test("check that URLs in seed-list hosted at URL are crawled", async () => { try { await exec( - 'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000', + `docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "${TEST_HOST}/urlSeedFile.txt" --timeout 90000 --scopeType page`, ); } catch (error) { console.log(error); diff --git a/yarn.lock b/yarn.lock index faa1fa15..49ca9c85 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1134,10 +1134,10 @@ resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406" integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ== -"@webrecorder/wabac@^2.23.8": - version "2.23.8" - resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.8.tgz#a3eb1e605acb706b6f043ec9e7fae9ff412ccc8a" - integrity sha512-+ShHsaBHwFC0SPFTpMWrwJHd47MzT6o1Rg12FSfGfpycrcmrBV447+JR28NitLJIsfcIif8xAth9Vh5Z7tHWlQ== +"@webrecorder/wabac@^2.23.11": + version "2.23.11" + resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.11.tgz#945da06e08b6d093b525e6e5bfd6a8f17beb995b" + integrity sha512-rsBAkcYvgX+0HgwhgvSb3cBCBp0rVnHGQS/K5A9aJwOmfymHt0C2vInH/lmKV/5H38rJu29c2cvRX962h+lUiw== dependencies: "@peculiar/asn1-ecc" "^2.3.4" "@peculiar/asn1-schema" "^2.3.3" @@ -1151,7 +1151,6 @@ buffer "^6.0.3" fast-xml-parser "^4.4.1" hash-wasm "^4.9.0" - http-link-header "^1.1.3" http-status-codes "^2.1.4" idb "^7.1.1" js-levenshtein "^1.1.6" @@ -1162,7 +1161,7 @@ path-parser "^6.1.0" process "^0.11.10" stream-browserify "^3.0.0" - warcio "^2.4.3" + warcio "^2.4.5" "@webrecorder/wombat@^3.8.14": version "3.8.14" @@ -2834,7 +2833,7 @@ html-escaper@^2.0.0: resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453" integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg== -http-link-header@^1.1.1, http-link-header@^1.1.3: +http-link-header@^1.1.1: version "1.1.3" resolved "https://registry.yarnpkg.com/http-link-header/-/http-link-header-1.1.3.tgz#b367b7a0ad1cf14027953f31aa1df40bb433da2a" integrity sha512-3cZ0SRL8fb9MUlU3mKM61FcQvPfXx2dBrZW3Vbg5CXa8jFlK8OaEpePenLe1oEXQduhz8b0QjsqfS59QP4AJDQ== @@ -5527,10 +5526,10 @@ walker@^1.0.8: dependencies: makeerror "1.0.12" -warcio@^2.4.0, warcio@^2.4.3, warcio@^2.4.4: - version "2.4.4" - resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.4.tgz#6c0c030bb55c0f0b824f854fa9e6718ca25d333d" - integrity sha512-FrWOhv1qLNhPBPGEMm24Yo+DtkipK5DxK3ckVGbOf0OJ/UqaxAhiiby74q+GW70dsJV0wF+RA1ToK6CKseTshA== +warcio@^2.4.0, warcio@^2.4.5: + version "2.4.5" + resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.5.tgz#ba39c38e433491ab9016282813b9cf6539c3d808" + integrity sha512-b6R/aIsR4fXzrpY/Zud7LqHFi2Bt8Ov5VLOnruHQ10rk129e9d0KOCZlyRmPD6ENTcV7yze5rXvJ5WSNS8R1zw== dependencies: "@types/pako" "^1.0.7" "@types/stream-buffers" "^3.0.7"