diff --git a/package.json b/package.json index 60200fcc..41825f34 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,7 @@ "dependencies": { "@novnc/novnc": "1.4.0", "@puppeteer/replay": "^3.1.3", - "@webrecorder/wabac": "^2.24.5", + "@webrecorder/wabac": "^2.25.0", "browsertrix-behaviors": "^0.9.7", "client-zip": "^2.4.5", "css-selector-parser": "^3.0.5", diff --git a/src/util/robots.ts b/src/util/robots.ts index a6db043d..89945b6c 100644 --- a/src/util/robots.ts +++ b/src/util/robots.ts @@ -9,14 +9,21 @@ import { timedRun } from "./timing.js"; let headers: Record = {}; let crawlState: RedisCrawlState | null = null; -const pendingFetches: Map> = new Map< +type FetchResp = { + url?: string; + content: string; +}; + +const pendingFetches: Map> = new Map< string, - Promise + Promise >(); // max seconds to wait to fetch robots const ROBOTS_FETCH_TIMEOUT = 10; +const decoder = new TextDecoder(); + export function setRobotsConfig( _headers: Record, state: RedisCrawlState, @@ -65,12 +72,14 @@ async function fetchAndParseRobots( pendingFetches.set(robotsUrl, promise); } - const content = await promise; + const resp = await promise; - if (content === null) { + if (resp === null) { return null; } + const { url, content } = resp; + logger.debug( "Caching robots.txt body", { url: robotsUrl, ...logDetails }, @@ -78,6 +87,11 @@ async function fetchAndParseRobots( ); await crawlState!.setCachedRobots(robotsUrl, content); + // if redirected to a different domain /robots.txt, also set for that domain + if (url && url.match(/^https?:\/\/[^/]+\/robots.txt$/)) { + await crawlState!.setCachedRobots(url, content); + } + // empty string cached, but no need to create parser return content ? robotsParser(robotsUrl, content) : null; } catch (e) { @@ -99,7 +113,7 @@ async function fetchAndParseRobots( async function fetchRobots( url: string, logDetails: LogDetails, -): Promise { +): Promise { logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots"); const resp = await fetch(url, { @@ -110,7 +124,7 @@ async function fetchRobots( if (resp.ok) { const buff = await resp.arrayBuffer(); // only decode and store at most 100K - return new TextDecoder().decode(buff.slice(0, 100000)); + return { url: resp.url, content: decoder.decode(buff.slice(0, 100000)) }; } logger.debug( @@ -120,5 +134,5 @@ async function fetchRobots( ); // for other status errors, just return empty - return ""; + return { content: "" }; } diff --git a/yarn.lock b/yarn.lock index 94214f3f..dcd99118 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1134,16 +1134,16 @@ resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406" integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ== -"@webrecorder/wabac@^2.24.5": - version "2.24.5" - resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.24.5.tgz#253b0d5a9b7691af475c7a158c6483df33f2d3ff" - integrity sha512-Wt9bdZMYrl+DYRkUdRfYAPSUfygHBQGIiZ9oMZ/DRLIeqsqeN36UBGuvbj6kmUawLNWRYVQj6kNl/ndtRZSa6w== +"@webrecorder/wabac@^2.25.0": + version "2.25.0" + resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.25.0.tgz#232eeb2dd9bdf58871aeeff6bfe4a18be30e0046" + integrity sha512-pJF0RavEGUwRGNWfhQtPgO7Ry7Sr1l50pf9RvvH3YCCNdRpcfhDw6oRf/Zc3V8z2UpBHLRInWwrjTiUV3WuJPg== dependencies: "@peculiar/asn1-ecc" "^2.3.4" "@peculiar/asn1-schema" "^2.3.3" "@peculiar/x509" "^1.9.2" "@types/js-levenshtein" "^1.1.3" - "@webrecorder/wombat" "^3.10.1" + "@webrecorder/wombat" "^3.10.2" acorn-loose "^8.5.2" auto-js-ipfs "^2.1.1" base64-js "^1.5.1" @@ -1154,7 +1154,7 @@ http-status-codes "^2.1.4" idb "^7.1.1" js-levenshtein "^1.1.6" - js-yaml "^4.1.0" + js-yaml "^4.1.1" pako "^1.0.11" parse5-html-rewriting-stream "^7.0.0" parse5-sax-parser "^7.0.0" @@ -1163,10 +1163,10 @@ stream-browserify "^3.0.0" warcio "^2.4.7" -"@webrecorder/wombat@^3.10.1": - version "3.10.1" - resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.10.1.tgz#d45417f9ef61c9f02357bacf1ba930f6e09dd7fa" - integrity sha512-OzYrv3iqC1AdgjG+QD/tNQi6+yv+1izzr2PoN+40AvleL+RLsIk/AwOmdYJpz9r+L7UIOdRrK4v/JrUWPwUDzw== +"@webrecorder/wombat@^3.10.2": + version "3.10.2" + resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.10.2.tgz#0d8e4057be66e4b67a522b605f8be907f912d5e0" + integrity sha512-01vr0mf+ecggorX6nyxTyf8gYrFichu1UE8RQi71Ckz8DAf+mxym96ozecq9bcS80+4Bkb7vFLAGyb4h6ZRXAw== dependencies: warcio "^2.4.7" @@ -3749,6 +3749,13 @@ js-yaml@^4.1.0: dependencies: argparse "^2.0.1" +js-yaml@^4.1.1: + version "4.1.1" + resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-4.1.1.tgz#854c292467705b699476e1a2decc0c8a3458806b" + integrity sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA== + dependencies: + argparse "^2.0.1" + jsbn@1.1.0: version "1.1.0" resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-1.1.0.tgz#b01307cb29b618a1ed26ec79e911f803c4da0040"