robots tweaks:

- if redirected to a different site's /robots.txt, also cache the entry for that site
- deps: bump to wabac.js 2.25.0
This commit is contained in:
Ilya Kreymer 2025-11-27 14:59:25 -08:00
parent 8658df3999
commit 081272a3f6
3 changed files with 39 additions and 18 deletions

View file

@ -18,7 +18,7 @@
"dependencies": {
"@novnc/novnc": "1.4.0",
"@puppeteer/replay": "^3.1.3",
"@webrecorder/wabac": "^2.24.5",
"@webrecorder/wabac": "^2.25.0",
"browsertrix-behaviors": "^0.9.7",
"client-zip": "^2.4.5",
"css-selector-parser": "^3.0.5",

View file

@ -9,14 +9,21 @@ import { timedRun } from "./timing.js";
let headers: Record<string, string> = {};
let crawlState: RedisCrawlState | null = null;
const pendingFetches: Map<string, Promise<string>> = new Map<
// Result of fetching a robots.txt file.
type FetchResp = {
// URL reported by the fetch response (`resp.url`); only set when the response was ok
url?: string;
// robots.txt body decoded from at most the first 100000 bytes, or "" when the response was not ok
content: string;
};
const pendingFetches: Map<string, Promise<FetchResp>> = new Map<
string,
Promise<string>
Promise<FetchResp>
>();
// max seconds to wait to fetch robots
const ROBOTS_FETCH_TIMEOUT = 10;
const decoder = new TextDecoder();
export function setRobotsConfig(
_headers: Record<string, string>,
state: RedisCrawlState,
@ -65,12 +72,14 @@ async function fetchAndParseRobots(
pendingFetches.set(robotsUrl, promise);
}
const content = await promise;
const resp = await promise;
if (content === null) {
if (resp === null) {
return null;
}
const { url, content } = resp;
logger.debug(
"Caching robots.txt body",
{ url: robotsUrl, ...logDetails },
@ -78,6 +87,11 @@ async function fetchAndParseRobots(
);
await crawlState!.setCachedRobots(robotsUrl, content);
// if redirected to a different domain /robots.txt, also set for that domain
if (url && url.match(/^https?:\/\/[^/]+\/robots.txt$/)) {
await crawlState!.setCachedRobots(url, content);
}
// empty string cached, but no need to create parser
return content ? robotsParser(robotsUrl, content) : null;
} catch (e) {
@ -99,7 +113,7 @@ async function fetchAndParseRobots(
async function fetchRobots(
url: string,
logDetails: LogDetails,
): Promise<string | null> {
): Promise<FetchResp | null> {
logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots");
const resp = await fetch(url, {
@ -110,7 +124,7 @@ async function fetchRobots(
if (resp.ok) {
const buff = await resp.arrayBuffer();
// only decode and store at most 100K
return new TextDecoder().decode(buff.slice(0, 100000));
return { url: resp.url, content: decoder.decode(buff.slice(0, 100000)) };
}
logger.debug(
@ -120,5 +134,5 @@ async function fetchRobots(
);
// for other status errors, just return empty
return "";
return { content: "" };
}

View file

@ -1134,16 +1134,16 @@
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
"@webrecorder/wabac@^2.24.5":
version "2.24.5"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.24.5.tgz#253b0d5a9b7691af475c7a158c6483df33f2d3ff"
integrity sha512-Wt9bdZMYrl+DYRkUdRfYAPSUfygHBQGIiZ9oMZ/DRLIeqsqeN36UBGuvbj6kmUawLNWRYVQj6kNl/ndtRZSa6w==
"@webrecorder/wabac@^2.25.0":
version "2.25.0"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.25.0.tgz#232eeb2dd9bdf58871aeeff6bfe4a18be30e0046"
integrity sha512-pJF0RavEGUwRGNWfhQtPgO7Ry7Sr1l50pf9RvvH3YCCNdRpcfhDw6oRf/Zc3V8z2UpBHLRInWwrjTiUV3WuJPg==
dependencies:
"@peculiar/asn1-ecc" "^2.3.4"
"@peculiar/asn1-schema" "^2.3.3"
"@peculiar/x509" "^1.9.2"
"@types/js-levenshtein" "^1.1.3"
"@webrecorder/wombat" "^3.10.1"
"@webrecorder/wombat" "^3.10.2"
acorn-loose "^8.5.2"
auto-js-ipfs "^2.1.1"
base64-js "^1.5.1"
@ -1154,7 +1154,7 @@
http-status-codes "^2.1.4"
idb "^7.1.1"
js-levenshtein "^1.1.6"
js-yaml "^4.1.0"
js-yaml "^4.1.1"
pako "^1.0.11"
parse5-html-rewriting-stream "^7.0.0"
parse5-sax-parser "^7.0.0"
@ -1163,10 +1163,10 @@
stream-browserify "^3.0.0"
warcio "^2.4.7"
"@webrecorder/wombat@^3.10.1":
version "3.10.1"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.10.1.tgz#d45417f9ef61c9f02357bacf1ba930f6e09dd7fa"
integrity sha512-OzYrv3iqC1AdgjG+QD/tNQi6+yv+1izzr2PoN+40AvleL+RLsIk/AwOmdYJpz9r+L7UIOdRrK4v/JrUWPwUDzw==
"@webrecorder/wombat@^3.10.2":
version "3.10.2"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.10.2.tgz#0d8e4057be66e4b67a522b605f8be907f912d5e0"
integrity sha512-01vr0mf+ecggorX6nyxTyf8gYrFichu1UE8RQi71Ckz8DAf+mxym96ozecq9bcS80+4Bkb7vFLAGyb4h6ZRXAw==
dependencies:
warcio "^2.4.7"
@ -3749,6 +3749,13 @@ js-yaml@^4.1.0:
dependencies:
argparse "^2.0.1"
js-yaml@^4.1.1:
version "4.1.1"
resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-4.1.1.tgz#854c292467705b699476e1a2decc0c8a3458806b"
integrity sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==
dependencies:
argparse "^2.0.1"
jsbn@1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-1.1.0.tgz#b01307cb29b618a1ed26ec79e911f803c4da0040"