mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
seed urls list: check for quoted URLs and remove quotes (#883)
- check for URLs that are wrapped in quotes, e.g. 'https://example.com/' or "https://example.com/", and trim and remove the quotes before adding the seed
- tests: add quoted URL to tests, fix old.webrecorder.net test
- deps: update wabac.js, RWP to latest
- logging: reduce error logging for seed lists, only log once that there are duplicates or that the page limit is reached
- fix for #882
This commit is contained in:
parent
705bc0cd9f
commit
a2742df328
8 changed files with 68 additions and 33 deletions
|
@ -39,7 +39,7 @@ ADD config/ /app/
|
|||
|
||||
ADD html/ /app/html/
|
||||
|
||||
ARG RWP_VERSION=2.3.15
|
||||
ARG RWP_VERSION=2.3.17
|
||||
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
|
||||
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
|
||||
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
"dependencies": {
|
||||
"@novnc/novnc": "1.4.0",
|
||||
"@puppeteer/replay": "^3.1.1",
|
||||
"@webrecorder/wabac": "^2.23.8",
|
||||
"@webrecorder/wabac": "^2.23.11",
|
||||
"browsertrix-behaviors": "^0.9.2",
|
||||
"client-zip": "^2.4.5",
|
||||
"css-selector-parser": "^3.0.5",
|
||||
|
@ -39,7 +39,7 @@
|
|||
"tsc": "^2.0.4",
|
||||
"undici": "^6.18.2",
|
||||
"uuid": "8.3.2",
|
||||
"warcio": "^2.4.4",
|
||||
"warcio": "^2.4.5",
|
||||
"ws": "^7.4.4",
|
||||
"yargs": "^17.7.2"
|
||||
},
|
||||
|
@ -71,7 +71,7 @@
|
|||
},
|
||||
"resolutions": {
|
||||
"wrap-ansi": "7.0.0",
|
||||
"warcio": "^2.4.4",
|
||||
"warcio": "^2.4.5",
|
||||
"@novnc/novnc": "1.4.0"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -129,6 +129,8 @@ export class Crawler {
|
|||
limitHit = false;
|
||||
pageLimit: number;
|
||||
|
||||
dupeSeedsFound = false;
|
||||
|
||||
saveStateFiles: string[] = [];
|
||||
lastSaveTime: number;
|
||||
|
||||
|
@ -2465,30 +2467,34 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
this.pageLimit,
|
||||
);
|
||||
|
||||
const logContext = depth === 0 ? "scope" : "links";
|
||||
const logLevel = depth === 0 ? "error" : "debug";
|
||||
|
||||
switch (result) {
|
||||
case QueueState.ADDED:
|
||||
logger.debug("Queued new page URL", { url, ...logDetails }, logContext);
|
||||
logger.debug("Queued new page URL", { url, ...logDetails }, "links");
|
||||
return true;
|
||||
|
||||
case QueueState.LIMIT_HIT:
|
||||
logger.logAsJSON(
|
||||
logger.debug(
|
||||
"Page URL not queued, at page limit",
|
||||
{ url, ...logDetails },
|
||||
logContext,
|
||||
logLevel,
|
||||
"links",
|
||||
);
|
||||
if (!this.limitHit && depth === 0) {
|
||||
logger.error(
|
||||
"Page limit reached when adding URL list, some URLs not crawled.",
|
||||
);
|
||||
}
|
||||
this.limitHit = true;
|
||||
return false;
|
||||
|
||||
case QueueState.DUPE_URL:
|
||||
logger.logAsJSON(
|
||||
if (!this.dupeSeedsFound && depth === 0) {
|
||||
logger.error("Duplicate seed URLs found and skipped");
|
||||
this.dupeSeedsFound = true;
|
||||
}
|
||||
logger.debug(
|
||||
"Page URL not queued, already seen",
|
||||
{ url, ...logDetails },
|
||||
logContext,
|
||||
logLevel,
|
||||
"links",
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -342,6 +342,7 @@ export async function parseSeeds(params: CrawlerArgs): Promise<ScopedSeed[]> {
|
|||
|
||||
for (const seed of seeds) {
|
||||
const newSeed = typeof seed === "string" ? { url: seed } : seed;
|
||||
newSeed.url = removeQuotes(newSeed.url);
|
||||
|
||||
try {
|
||||
scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
|
||||
|
@ -389,3 +390,14 @@ export function parseRx(
|
|||
return value.map((e) => (e instanceof RegExp ? e : new RegExp(e)));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Normalizes a seed URL string that may have been wrapped in quotes.
 *
 * Surrounding whitespace is trimmed first. If the remaining text is enclosed
 * in a matching pair of double quotes or single quotes, that one pair is
 * removed and the inner text is trimmed again, so that input such as
 * `" https://example.com/ "` yields `https://example.com/`.
 *
 * Mismatched quotes (e.g. `"https://example.com/'`) and a lone quote
 * character are left untouched apart from the outer trim.
 *
 * @param url - Raw URL text, e.g. one entry of a seed list.
 * @returns The trimmed URL with one matching pair of wrapping quotes removed.
 */
export function removeQuotes(url: string): string {
  const trimmed = url.trim();

  // A wrapped value needs an opening AND a closing quote; without this check
  // a single `"` would count as both and collapse to an empty string.
  if (trimmed.length < 2) {
    return trimmed;
  }

  const first = trimmed[0];
  const last = trimmed[trimmed.length - 1];
  const isQuote = first === `"` || first === `'`;

  if (isQuote && first === last) {
    // Trim again: whitespace may sit between the quotes and the URL itself.
    return trimmed.slice(1, -1).trim();
  }

  return trimmed;
}
|
||||
|
|
3
tests/fixtures/urlSeedFile.txt
vendored
3
tests/fixtures/urlSeedFile.txt
vendored
|
@ -1,2 +1,3 @@
|
|||
https://webrecorder.net/about/
|
||||
https://old.webrecorder.net/about/
|
||||
https://specs.webrecorder.net/wacz/1.1.1/
|
||||
"https://old.webrecorder.net/faq"
|
||||
|
|
|
@ -118,9 +118,9 @@ function validateResourcesIndex(json) {
|
|||
{ status: 200, mime: "text/css", type: "stylesheet" },
|
||||
"https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap":
|
||||
{ status: 200, mime: "text/css", type: "stylesheet" },
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
|
||||
{ status: 200, mime: "font/woff2", type: "font" },
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
|
||||
{ status: 200, mime: "font/woff2", type: "font" },
|
||||
"https://old.webrecorder.net/assets/favicon.ico": {
|
||||
status: 200,
|
||||
|
@ -161,9 +161,9 @@ function validateResourcesAbout(json) {
|
|||
mime: "image/svg+xml",
|
||||
type: "image",
|
||||
},
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
|
||||
{ status: 200, mime: "font/woff2", type: "font" },
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
|
||||
{ status: 200, mime: "font/woff2", type: "font" },
|
||||
});
|
||||
}
|
||||
|
|
|
@ -1,13 +1,30 @@
|
|||
import util from "util";
|
||||
import { exec as execCallback } from "child_process";
|
||||
import { spawn, exec as execCallback } from "child_process";
|
||||
import fs from "fs";
|
||||
|
||||
const exec = util.promisify(execCallback);
|
||||
|
||||
let proc = null;
|
||||
|
||||
const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
|
||||
const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
|
||||
|
||||
beforeAll(() => {
|
||||
proc = spawn("../../node_modules/.bin/http-server", ["-p", "31502"], {cwd: "tests/fixtures/"});
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
if (proc) {
|
||||
proc.kill();
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
|
||||
test("check that URLs in seed-list are crawled", async () => {
|
||||
try {
|
||||
await exec(
|
||||
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",
|
||||
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000 --scopeType page",
|
||||
);
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
|
@ -43,7 +60,7 @@ test("check that URLs in seed-list are crawled", async () => {
|
|||
test("check that URLs in seed-list hosted at URL are crawled", async () => {
|
||||
try {
|
||||
await exec(
|
||||
'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000',
|
||||
`docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "${TEST_HOST}/urlSeedFile.txt" --timeout 90000 --scopeType page`,
|
||||
);
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
|
|
21
yarn.lock
21
yarn.lock
|
@ -1134,10 +1134,10 @@
|
|||
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
|
||||
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
|
||||
|
||||
"@webrecorder/wabac@^2.23.8":
|
||||
version "2.23.8"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.8.tgz#a3eb1e605acb706b6f043ec9e7fae9ff412ccc8a"
|
||||
integrity sha512-+ShHsaBHwFC0SPFTpMWrwJHd47MzT6o1Rg12FSfGfpycrcmrBV447+JR28NitLJIsfcIif8xAth9Vh5Z7tHWlQ==
|
||||
"@webrecorder/wabac@^2.23.11":
|
||||
version "2.23.11"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.11.tgz#945da06e08b6d093b525e6e5bfd6a8f17beb995b"
|
||||
integrity sha512-rsBAkcYvgX+0HgwhgvSb3cBCBp0rVnHGQS/K5A9aJwOmfymHt0C2vInH/lmKV/5H38rJu29c2cvRX962h+lUiw==
|
||||
dependencies:
|
||||
"@peculiar/asn1-ecc" "^2.3.4"
|
||||
"@peculiar/asn1-schema" "^2.3.3"
|
||||
|
@ -1151,7 +1151,6 @@
|
|||
buffer "^6.0.3"
|
||||
fast-xml-parser "^4.4.1"
|
||||
hash-wasm "^4.9.0"
|
||||
http-link-header "^1.1.3"
|
||||
http-status-codes "^2.1.4"
|
||||
idb "^7.1.1"
|
||||
js-levenshtein "^1.1.6"
|
||||
|
@ -1162,7 +1161,7 @@
|
|||
path-parser "^6.1.0"
|
||||
process "^0.11.10"
|
||||
stream-browserify "^3.0.0"
|
||||
warcio "^2.4.3"
|
||||
warcio "^2.4.5"
|
||||
|
||||
"@webrecorder/wombat@^3.8.14":
|
||||
version "3.8.14"
|
||||
|
@ -2834,7 +2833,7 @@ html-escaper@^2.0.0:
|
|||
resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453"
|
||||
integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==
|
||||
|
||||
http-link-header@^1.1.1, http-link-header@^1.1.3:
|
||||
http-link-header@^1.1.1:
|
||||
version "1.1.3"
|
||||
resolved "https://registry.yarnpkg.com/http-link-header/-/http-link-header-1.1.3.tgz#b367b7a0ad1cf14027953f31aa1df40bb433da2a"
|
||||
integrity sha512-3cZ0SRL8fb9MUlU3mKM61FcQvPfXx2dBrZW3Vbg5CXa8jFlK8OaEpePenLe1oEXQduhz8b0QjsqfS59QP4AJDQ==
|
||||
|
@ -5527,10 +5526,10 @@ walker@^1.0.8:
|
|||
dependencies:
|
||||
makeerror "1.0.12"
|
||||
|
||||
warcio@^2.4.0, warcio@^2.4.3, warcio@^2.4.4:
|
||||
version "2.4.4"
|
||||
resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.4.tgz#6c0c030bb55c0f0b824f854fa9e6718ca25d333d"
|
||||
integrity sha512-FrWOhv1qLNhPBPGEMm24Yo+DtkipK5DxK3ckVGbOf0OJ/UqaxAhiiby74q+GW70dsJV0wF+RA1ToK6CKseTshA==
|
||||
warcio@^2.4.0, warcio@^2.4.5:
|
||||
version "2.4.5"
|
||||
resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.5.tgz#ba39c38e433491ab9016282813b9cf6539c3d808"
|
||||
integrity sha512-b6R/aIsR4fXzrpY/Zud7LqHFi2Bt8Ov5VLOnruHQ10rk129e9d0KOCZlyRmPD6ENTcV7yze5rXvJ5WSNS8R1zw==
|
||||
dependencies:
|
||||
"@types/pako" "^1.0.7"
|
||||
"@types/stream-buffers" "^3.0.7"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue