mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-07 13:49:47 +00:00
robots tweaks:
- if redirected to a different site's /robots.txt, also cache the entry for that site
- deps: bump to wabac.js 2.25.0
This commit is contained in:
parent
8658df3999
commit
081272a3f6
3 changed files with 39 additions and 18 deletions
|
|
@ -18,7 +18,7 @@
|
|||
"dependencies": {
|
||||
"@novnc/novnc": "1.4.0",
|
||||
"@puppeteer/replay": "^3.1.3",
|
||||
"@webrecorder/wabac": "^2.24.5",
|
||||
"@webrecorder/wabac": "^2.25.0",
|
||||
"browsertrix-behaviors": "^0.9.7",
|
||||
"client-zip": "^2.4.5",
|
||||
"css-selector-parser": "^3.0.5",
|
||||
|
|
|
|||
|
|
@ -9,14 +9,21 @@ import { timedRun } from "./timing.js";
|
|||
let headers: Record<string, string> = {};
|
||||
let crawlState: RedisCrawlState | null = null;
|
||||
|
||||
const pendingFetches: Map<string, Promise<string>> = new Map<
|
||||
type FetchResp = {
|
||||
url?: string;
|
||||
content: string;
|
||||
};
|
||||
|
||||
const pendingFetches: Map<string, Promise<FetchResp>> = new Map<
|
||||
string,
|
||||
Promise<string>
|
||||
Promise<FetchResp>
|
||||
>();
|
||||
|
||||
// max seconds to wait to fetch robots
|
||||
const ROBOTS_FETCH_TIMEOUT = 10;
|
||||
|
||||
const decoder = new TextDecoder();
|
||||
|
||||
export function setRobotsConfig(
|
||||
_headers: Record<string, string>,
|
||||
state: RedisCrawlState,
|
||||
|
|
@ -65,12 +72,14 @@ async function fetchAndParseRobots(
|
|||
pendingFetches.set(robotsUrl, promise);
|
||||
}
|
||||
|
||||
const content = await promise;
|
||||
const resp = await promise;
|
||||
|
||||
if (content === null) {
|
||||
if (resp === null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const { url, content } = resp;
|
||||
|
||||
logger.debug(
|
||||
"Caching robots.txt body",
|
||||
{ url: robotsUrl, ...logDetails },
|
||||
|
|
@ -78,6 +87,11 @@ async function fetchAndParseRobots(
|
|||
);
|
||||
await crawlState!.setCachedRobots(robotsUrl, content);
|
||||
|
||||
// if redirected to a different domain /robots.txt, also set for that domain
|
||||
if (url && url.match(/^https?:\/\/[^/]+\/robots.txt$/)) {
|
||||
await crawlState!.setCachedRobots(url, content);
|
||||
}
|
||||
|
||||
// empty string cached, but no need to create parser
|
||||
return content ? robotsParser(robotsUrl, content) : null;
|
||||
} catch (e) {
|
||||
|
|
@ -99,7 +113,7 @@ async function fetchAndParseRobots(
|
|||
async function fetchRobots(
|
||||
url: string,
|
||||
logDetails: LogDetails,
|
||||
): Promise<string | null> {
|
||||
): Promise<FetchResp | null> {
|
||||
logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots");
|
||||
|
||||
const resp = await fetch(url, {
|
||||
|
|
@ -110,7 +124,7 @@ async function fetchRobots(
|
|||
if (resp.ok) {
|
||||
const buff = await resp.arrayBuffer();
|
||||
// only decode and store at most 100K
|
||||
return new TextDecoder().decode(buff.slice(0, 100000));
|
||||
return { url: resp.url, content: decoder.decode(buff.slice(0, 100000)) };
|
||||
}
|
||||
|
||||
logger.debug(
|
||||
|
|
@ -120,5 +134,5 @@ async function fetchRobots(
|
|||
);
|
||||
|
||||
// for other status errors, just return empty
|
||||
return "";
|
||||
return { content: "" };
|
||||
}
|
||||
|
|
|
|||
27
yarn.lock
27
yarn.lock
|
|
@ -1134,16 +1134,16 @@
|
|||
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
|
||||
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
|
||||
|
||||
"@webrecorder/wabac@^2.24.5":
|
||||
version "2.24.5"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.24.5.tgz#253b0d5a9b7691af475c7a158c6483df33f2d3ff"
|
||||
integrity sha512-Wt9bdZMYrl+DYRkUdRfYAPSUfygHBQGIiZ9oMZ/DRLIeqsqeN36UBGuvbj6kmUawLNWRYVQj6kNl/ndtRZSa6w==
|
||||
"@webrecorder/wabac@^2.25.0":
|
||||
version "2.25.0"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.25.0.tgz#232eeb2dd9bdf58871aeeff6bfe4a18be30e0046"
|
||||
integrity sha512-pJF0RavEGUwRGNWfhQtPgO7Ry7Sr1l50pf9RvvH3YCCNdRpcfhDw6oRf/Zc3V8z2UpBHLRInWwrjTiUV3WuJPg==
|
||||
dependencies:
|
||||
"@peculiar/asn1-ecc" "^2.3.4"
|
||||
"@peculiar/asn1-schema" "^2.3.3"
|
||||
"@peculiar/x509" "^1.9.2"
|
||||
"@types/js-levenshtein" "^1.1.3"
|
||||
"@webrecorder/wombat" "^3.10.1"
|
||||
"@webrecorder/wombat" "^3.10.2"
|
||||
acorn-loose "^8.5.2"
|
||||
auto-js-ipfs "^2.1.1"
|
||||
base64-js "^1.5.1"
|
||||
|
|
@ -1154,7 +1154,7 @@
|
|||
http-status-codes "^2.1.4"
|
||||
idb "^7.1.1"
|
||||
js-levenshtein "^1.1.6"
|
||||
js-yaml "^4.1.0"
|
||||
js-yaml "^4.1.1"
|
||||
pako "^1.0.11"
|
||||
parse5-html-rewriting-stream "^7.0.0"
|
||||
parse5-sax-parser "^7.0.0"
|
||||
|
|
@ -1163,10 +1163,10 @@
|
|||
stream-browserify "^3.0.0"
|
||||
warcio "^2.4.7"
|
||||
|
||||
"@webrecorder/wombat@^3.10.1":
|
||||
version "3.10.1"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.10.1.tgz#d45417f9ef61c9f02357bacf1ba930f6e09dd7fa"
|
||||
integrity sha512-OzYrv3iqC1AdgjG+QD/tNQi6+yv+1izzr2PoN+40AvleL+RLsIk/AwOmdYJpz9r+L7UIOdRrK4v/JrUWPwUDzw==
|
||||
"@webrecorder/wombat@^3.10.2":
|
||||
version "3.10.2"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.10.2.tgz#0d8e4057be66e4b67a522b605f8be907f912d5e0"
|
||||
integrity sha512-01vr0mf+ecggorX6nyxTyf8gYrFichu1UE8RQi71Ckz8DAf+mxym96ozecq9bcS80+4Bkb7vFLAGyb4h6ZRXAw==
|
||||
dependencies:
|
||||
warcio "^2.4.7"
|
||||
|
||||
|
|
@ -3749,6 +3749,13 @@ js-yaml@^4.1.0:
|
|||
dependencies:
|
||||
argparse "^2.0.1"
|
||||
|
||||
js-yaml@^4.1.1:
|
||||
version "4.1.1"
|
||||
resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-4.1.1.tgz#854c292467705b699476e1a2decc0c8a3458806b"
|
||||
integrity sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==
|
||||
dependencies:
|
||||
argparse "^2.0.1"
|
||||
|
||||
jsbn@1.1.0:
|
||||
version "1.1.0"
|
||||
resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-1.1.0.tgz#b01307cb29b618a1ed26ec79e911f803c4da0040"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue