Add option to respect robots.txt disallows (#888)

Fixes #631 
- Adds a --robots flag which, when set, checks the host's robots.txt before each page is queued for further crawling.
- Supports a --robotsAgent flag, which configures the user agent to check in robots.txt in addition to '*'. Defaults to 'Browsertrix/1.x'.
- Robots.txt bodies are parsed and checked for page allow/disallow status using the https://github.com/samclarke/robots-parser library, which is the most active and well-maintained implementation I could find with TypeScript types (a minimal sketch of this check follows the list).
- Fetched robots.txt bodies are cached by their URL in Redis using an LRU cache, retaining the last 100 robots.txt entries, each up to 100K.
- Non-200 responses are treated as empty robots.txt, and an empty robots.txt is treated as 'allow all'.
- Multiple requests for the same robots.txt are batched to perform only one fetch, waiting up to 10 seconds per fetch.
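
As a rough illustration of the check this adds, here is a minimal sketch (not the crawler's actual code): it omits the Redis LRU cache, the proxy dispatcher, the fetch timeout, and the batching of pending fetches that `util/robots.ts` implements. The helper name `isDisallowed` is hypothetical.

```ts
import robotsParser from "robots-parser";

// Fetch a host's robots.txt and check whether a page URL is disallowed.
// Simplified sketch: no caching, batching, timeout, or proxy handling.
async function isDisallowed(
  pageUrl: string,
  agent = "Browsertrix/1.x",
): Promise<boolean> {
  const robotsUrl = new URL("/robots.txt", pageUrl).href;
  const resp = await fetch(robotsUrl);
  // Non-200 responses are treated as an empty robots.txt ("allow all")
  const body = resp.ok ? (await resp.text()).slice(0, 100000) : "";
  if (!body) {
    return false;
  }
  const robots = robotsParser(robotsUrl, body);
  // robots-parser falls back to the '*' group when no rules match `agent`
  return robots.isDisallowed(pageUrl, agent) === true;
}
```

In the actual implementation below, the fetched body (or an empty string for non-200 responses) is cached in Redis before a parser is created, so repeat checks against the same host skip the network entirely.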

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Author: Tessa Walsh, 2025-11-26 22:00:06 -05:00 (committed by GitHub)
parent 75a0c9a305
commit 1d15a155f2
9 changed files with 247 additions and 5 deletions


@@ -103,16 +103,16 @@ Options:
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
"robots"] [default: []]
--logExcludeContext Comma-separated list of contexts to
NOT include in logs
[array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
"robots"] [default: ["recorderNetwork","jsError","screencast"]]
--text Extract initial (default) or final t
ext to pages.jsonl or WARC resource
record(s)

@@ -324,6 +324,12 @@
the Chrome instance (space-separated
or multiple --extraChromeArgs)
[array] [default: []]
--robots If set, fetch and respect page disal
lows specified in per-host robots.tx
t [boolean] [default: false]
--robotsAgent Agent to check in addition to '*' fo
r robots rules
[string] [default: "Browsertrix/1.x"]
--config Path to YAML config file
```


@@ -34,6 +34,7 @@
    "pixelmatch": "^5.3.0",
    "pngjs": "^7.0.0",
    "puppeteer-core": "^24.30.0",
    "robots-parser": "^3.0.1",
    "sax": "^1.3.0",
    "sharp": "^0.32.6",
    "tsc": "^2.0.4",


@@ -72,6 +72,7 @@ import {
import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
import { initProxy } from "./util/proxy.js";
import { initFlow, nextFlowStep } from "./util/flowbehavior.js";
import { isDisallowedByRobots, setRobotsConfig } from "./util/robots.js";

const btrixBehaviors = fs.readFileSync(
  new URL(

@@ -547,6 +548,10 @@ export class Crawler {
    this.headers = { "User-Agent": this.configureUA() };

    if (this.params.robots) {
      setRobotsConfig(this.headers, this.crawlState);
    }

    process.on("exit", () => {
      for (const proc of subprocesses) {
        proc.kill();

@@ -2506,6 +2511,18 @@ self.__bx_behaviors.selectMainBehavior();
      return false;
    }

    if (
      this.params.robots &&
      (await isDisallowedByRobots(url, logDetails, this.params.robotsAgent))
    ) {
      logger.debug(
        "Page URL not queued, disallowed by robots.txt",
        { url, ...logDetails },
        "links",
      );
      return false;
    }

    const result = await this.crawlState.addToQueue(
      { url, seedId, depth, extraHops, ts, pageid },
      this.pageLimit,


@@ -704,6 +704,19 @@ class ArgParser {
      type: "array",
      default: [],
    },

    robots: {
      describe:
        "If set, fetch and respect page disallows specified in per-host robots.txt",
      type: "boolean",
      default: false,
    },

    robotsAgent: {
      describe: "Agent to check in addition to '*' for robots rules",
      type: "string",
      default: "Browsertrix/1.x",
    },
  });
}


@@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
export const PAGE_OP_TIMEOUT_SECS = 5;
export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;

export const ROBOTS_CACHE_LIMIT = 100;

export type ExtractSelector = {
  selector: string;
  extract: string;


@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
  "replay",
  "proxy",
  "scope",
  "robots",
] as const;

export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];

src/util/robots.ts (new file, 124 lines)

@@ -0,0 +1,124 @@
import { fetch } from "undici";
import robotsParser, { Robot } from "robots-parser";

import { LogDetails, logger } from "./logger.js";
import { RedisCrawlState } from "./state.js";
import { getProxyDispatcher } from "./proxy.js";
import { timedRun } from "./timing.js";

let headers: Record<string, string> = {};
let crawlState: RedisCrawlState | null = null;

const pendingFetches: Map<string, Promise<string>> = new Map<
  string,
  Promise<string>
>();

// max seconds to wait to fetch robots
const ROBOTS_FETCH_TIMEOUT = 10;

export function setRobotsConfig(
  _headers: Record<string, string>,
  state: RedisCrawlState,
) {
  headers = _headers;
  crawlState = state;
}

export async function isDisallowedByRobots(
  url: string,
  logDetails: LogDetails,
  robotsAgent: string,
) {
  const robots = await fetchAndParseRobots(url, logDetails);
  return robots && robots.isDisallowed(url, robotsAgent);
}

async function fetchAndParseRobots(
  url: string,
  logDetails: LogDetails,
): Promise<Robot | null> {
  // Fetch robots.txt for url's host and return parser.
  // Results are cached by robots.txt URL in Redis using an LRU cache
  // implementation that retains the 100 most recently used values.
  const urlParser = new URL(url);
  const robotsUrl = `${urlParser.origin}/robots.txt`;

  const cachedRobots = await crawlState!.getCachedRobots(robotsUrl);

  // empty string is valid cached empty robots, so check for null
  if (cachedRobots !== null) {
    // don't create parser, just skip check if empty string
    return cachedRobots ? robotsParser(robotsUrl, cachedRobots) : null;
  }

  try {
    let promise = pendingFetches.get(robotsUrl);
    if (!promise) {
      promise = timedRun(
        fetchRobots(robotsUrl, logDetails),
        ROBOTS_FETCH_TIMEOUT,
        "Fetching Robots timed out",
        logDetails,
        "robots",
      );
      pendingFetches.set(robotsUrl, promise);
    }

    const content = await promise;

    if (content === null) {
      return null;
    }

    logger.debug(
      "Caching robots.txt body",
      { url: robotsUrl, ...logDetails },
      "robots",
    );
    await crawlState!.setCachedRobots(robotsUrl, content);

    // empty string cached, but no need to create parser
    return content ? robotsParser(robotsUrl, content) : null;
  } catch (e) {
    // ignore
  } finally {
    pendingFetches.delete(robotsUrl);
  }

  logger.warn(
    "Failed to fetch robots.txt",
    {
      url: robotsUrl,
      ...logDetails,
    },
    "robots",
  );
  return null;
}

async function fetchRobots(
  url: string,
  logDetails: LogDetails,
): Promise<string | null> {
  logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots");

  const resp = await fetch(url, {
    headers,
    dispatcher: getProxyDispatcher(url),
  });

  if (resp.ok) {
    const buff = await resp.arrayBuffer();
    // only decode and store at most 100K
    return new TextDecoder().decode(buff.slice(0, 100000));
  }

  logger.debug(
    "Robots.txt invalid, storing empty value",
    { url, status: resp.status },
    "robots",
  );

  // for other status errors, just return empty
  return "";
}


@@ -3,7 +3,11 @@ import { v4 as uuidv4 } from "uuid";
import { logger } from "./logger.js";
import {
  MAX_DEPTH,
  DEFAULT_MAX_RETRIES,
  ROBOTS_CACHE_LIMIT,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core";
import { interpolateFilename, UploadResult } from "./storage.js";

@@ -200,7 +204,10 @@ export class RedisCrawlState {
  fkey: string;
  ekey: string;
  bkey: string;
  rkey: string;
  lkey: string;

  pageskey: string;
  esKey: string;
  esMap: string;

@@ -233,6 +240,10 @@ export class RedisCrawlState {
    this.ekey = this.key + ":e";
    // crawler behavior script messages
    this.bkey = this.key + ":b";
    // cached robots.txt bodies (per-origin)
    this.rkey = this.key + ":r";
    // LRU cache of robots.txt keys
    this.lkey = this.key + ":l";

    // pages
    this.pageskey = this.key + ":pages";

@@ -1025,6 +1036,38 @@ return inx;
    return await this.redis.lpush(this.bkey, behaviorLog);
  }

  async _updateRobotsAccessTime(robotsUrl: string) {
    const accessTime = Date.now();
    await this.redis.zadd(this.lkey, accessTime, robotsUrl);
  }

  async setCachedRobots(robotsUrl: string, body: string) {
    await this._updateRobotsAccessTime(robotsUrl);
    await this.redis.set(`${this.rkey}:${robotsUrl}`, body);

    // prune least-recently used items in zset and robots cache if over limit
    const cacheCount = await this.redis.zcard(this.lkey);
    if (cacheCount > ROBOTS_CACHE_LIMIT) {
      const diff = cacheCount - ROBOTS_CACHE_LIMIT;
      const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1);
      for (const keyToDelete of keysToDelete) {
        logger.debug(
          "Deleting cached robots.txt, over cache limit",
          { url: keyToDelete },
          "robots",
        );
        await this.redis.del(`${this.rkey}:${keyToDelete}`);
        await this.redis.zrem(this.lkey, keyToDelete);
      }
    }
  }

  async getCachedRobots(robotsUrl: string) {
    await this._updateRobotsAccessTime(robotsUrl);
    return await this.redis.get(`${this.rkey}:${robotsUrl}`);
  }

  async writeToPagesQueue(
    data: Record<string, string | number | boolean | object>,
  ) {

tests/robots_txt.test.js (new file, 35 lines)

@@ -0,0 +1,35 @@
import child_process from "child_process";

test("test robots.txt is fetched and cached", async () => {
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
  );

  const log = res.toString();

  // robots.txt not found
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
    ) > 0,
  ).toBe(true);

  // robots.txt found and cached
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);
});