Add option to respect robots.txt disallows (#888)

Fixes #631 
- Adds --robots flag, which enables checking each host's robots.txt for each page before the page is queued for further crawling.
- Adds --robotsAgent flag, which configures the agent to check in robots.txt in addition to '*'; defaults to 'Browsertrix/1.x'.
- Robots.txt bodies are parsed and checked for page allow/disallow status
using the https://github.com/samclarke/robots-parser library, which is
the most active and well-maintained implementation I could find with
TypeScript types.
- Fetched robots.txt bodies are cached by URL in Redis using an LRU cache, retaining the last 100 robots.txt entries, each up to 100K in size.
- Non-200 responses are treated as an empty robots.txt, and an empty robots.txt is treated as 'allow all'.
- Multiple requests for the same robots.txt are batched to perform only one fetch, waiting up to 10 seconds per fetch (a simplified sketch of the check flow follows below).
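
For reference, a minimal, self-contained sketch of the per-page check using the same robots-parser API. The in-memory Map cache, the global fetch call with an AbortSignal timeout, and the `isDisallowedByRobotsSketch` name are illustrative stand-ins, not the actual implementation, which uses the Redis-backed LRU cache and undici fetch shown in the diff below.

```ts
import robotsParser from "robots-parser";

// robots.txt URL -> cached body ("" means allow all)
const robotsCache = new Map<string, string>();

export async function isDisallowedByRobotsSketch(
  pageUrl: string,
  agent = "Browsertrix/1.x",
): Promise<boolean> {
  const robotsUrl = `${new URL(pageUrl).origin}/robots.txt`;

  let body = robotsCache.get(robotsUrl);
  if (body === undefined) {
    try {
      const resp = await fetch(robotsUrl, {
        signal: AbortSignal.timeout(10_000),
      });
      // non-200 responses are treated as an empty robots.txt (allow all)
      body = resp.ok ? (await resp.text()).slice(0, 100_000) : "";
    } catch {
      // fetch errors and timeouts also fall back to allow all
      body = "";
    }
    robotsCache.set(robotsUrl, body);
  }

  if (!body) {
    return false;
  }

  // robots-parser consults rules for the named agent as well as '*'
  return robotsParser(robotsUrl, body).isDisallowed(pageUrl, agent) === true;
}
```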

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh 2025-11-26 22:00:06 -05:00 committed by GitHub
parent 75a0c9a305
commit 1d15a155f2
9 changed files with 247 additions and 5 deletions


@@ -103,16 +103,16 @@ Options:
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
[default: []]
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
"robots"] [default: []]
--logExcludeContext Comma-separated list of contexts to
NOT include in logs
[array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
[default: ["recorderNetwork","jsError","screencast"]]
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
"robots"] [default: ["recorderNetwork","jsError","screencast"]]
--text Extract initial (default) or final t
ext to pages.jsonl or WARC resource
record(s)
@@ -324,6 +324,12 @@ Options:
the Chrome instance (space-separated
or multiple --extraChromeArgs)
[array] [default: []]
--robots If set, fetch and respect page disal
lows specified in per-host robots.tx
t [boolean] [default: false]
--robotsAgent Agent to check in addition to '*' fo
r robots rules
[string] [default: "Browsertrix/1.x"]
--config Path to YAML config file
```


@@ -34,6 +34,7 @@
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
"puppeteer-core": "^24.30.0",
"robots-parser": "^3.0.1",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",


@@ -72,6 +72,7 @@ import {
import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
import { initProxy } from "./util/proxy.js";
import { initFlow, nextFlowStep } from "./util/flowbehavior.js";
import { isDisallowedByRobots, setRobotsConfig } from "./util/robots.js";
const btrixBehaviors = fs.readFileSync(
new URL(
@@ -547,6 +548,10 @@ export class Crawler {
this.headers = { "User-Agent": this.configureUA() };
if (this.params.robots) {
setRobotsConfig(this.headers, this.crawlState);
}
process.on("exit", () => {
for (const proc of subprocesses) {
proc.kill();
@@ -2506,6 +2511,18 @@ self.__bx_behaviors.selectMainBehavior();
return false;
}
if (
this.params.robots &&
(await isDisallowedByRobots(url, logDetails, this.params.robotsAgent))
) {
logger.debug(
"Page URL not queued, disallowed by robots.txt",
{ url, ...logDetails },
"links",
);
return false;
}
const result = await this.crawlState.addToQueue(
{ url, seedId, depth, extraHops, ts, pageid },
this.pageLimit,


@@ -704,6 +704,19 @@ class ArgParser {
type: "array",
default: [],
},
robots: {
describe:
"If set, fetch and respect page disallows specified in per-host robots.txt",
type: "boolean",
default: false,
},
robotsAgent: {
describe: "Agent to check in addition to '*' for robots rules",
type: "string",
default: "Browsertrix/1.x",
},
});
}


@@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
export const PAGE_OP_TIMEOUT_SECS = 5;
export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
export const ROBOTS_CACHE_LIMIT = 100;
export type ExtractSelector = {
selector: string;
extract: string;


@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
"replay",
"proxy",
"scope",
"robots",
] as const;
export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];

src/util/robots.ts (new file, 124 lines)

@@ -0,0 +1,124 @@
import { fetch } from "undici";
import robotsParser, { Robot } from "robots-parser";
import { LogDetails, logger } from "./logger.js";
import { RedisCrawlState } from "./state.js";
import { getProxyDispatcher } from "./proxy.js";
import { timedRun } from "./timing.js";
let headers: Record<string, string> = {};
let crawlState: RedisCrawlState | null = null;
const pendingFetches: Map<string, Promise<string>> = new Map<
string,
Promise<string>
>();
// max seconds to wait to fetch robots
const ROBOTS_FETCH_TIMEOUT = 10;
export function setRobotsConfig(
_headers: Record<string, string>,
state: RedisCrawlState,
) {
headers = _headers;
crawlState = state;
}
export async function isDisallowedByRobots(
url: string,
logDetails: LogDetails,
robotsAgent: string,
) {
const robots = await fetchAndParseRobots(url, logDetails);
return robots && robots.isDisallowed(url, robotsAgent);
}
async function fetchAndParseRobots(
url: string,
logDetails: LogDetails,
): Promise<Robot | null> {
// Fetch robots.txt for url's host and return parser.
// Results are cached by robots.txt URL in Redis using an LRU cache
// implementation that retains the 100 most recently used values.
const urlParser = new URL(url);
const robotsUrl = `${urlParser.origin}/robots.txt`;
const cachedRobots = await crawlState!.getCachedRobots(robotsUrl);
// empty string is valid cached empty robots, so check for null
if (cachedRobots !== null) {
// don't create parser, just skip check if empty string
return cachedRobots ? robotsParser(robotsUrl, cachedRobots) : null;
}
try {
let promise = pendingFetches.get(robotsUrl);
if (!promise) {
promise = timedRun(
fetchRobots(robotsUrl, logDetails),
ROBOTS_FETCH_TIMEOUT,
"Fetching Robots timed out",
logDetails,
"robots",
);
pendingFetches.set(robotsUrl, promise);
}
const content = await promise;
if (content === null) {
return null;
}
logger.debug(
"Caching robots.txt body",
{ url: robotsUrl, ...logDetails },
"robots",
);
await crawlState!.setCachedRobots(robotsUrl, content);
// empty string cached, but no need to create parser
return content ? robotsParser(robotsUrl, content) : null;
} catch (e) {
// ignore
} finally {
pendingFetches.delete(robotsUrl);
}
logger.warn(
"Failed to fetch robots.txt",
{
url: robotsUrl,
...logDetails,
},
"robots",
);
return null;
}
async function fetchRobots(
url: string,
logDetails: LogDetails,
): Promise<string | null> {
logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots");
const resp = await fetch(url, {
headers,
dispatcher: getProxyDispatcher(url),
});
if (resp.ok) {
const buff = await resp.arrayBuffer();
// only decode and store at most 100K
return new TextDecoder().decode(buff.slice(0, 100000));
}
logger.debug(
"Robots.txt invalid, storing empty value",
{ url, status: resp.status },
"robots",
);
// for other status errors, just return empty
return "";
}


@@ -3,7 +3,11 @@ import { v4 as uuidv4 } from "uuid";
import { logger } from "./logger.js";
import { MAX_DEPTH, DEFAULT_MAX_RETRIES } from "./constants.js";
import {
MAX_DEPTH,
DEFAULT_MAX_RETRIES,
ROBOTS_CACHE_LIMIT,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core";
import { interpolateFilename, UploadResult } from "./storage.js";
@@ -200,7 +204,10 @@ export class RedisCrawlState {
fkey: string;
ekey: string;
bkey: string;
rkey: string;
lkey: string;
pageskey: string;
esKey: string;
esMap: string;
@@ -233,6 +240,10 @@ export class RedisCrawlState {
this.ekey = this.key + ":e";
// crawler behavior script messages
this.bkey = this.key + ":b";
// cached robots.txt bodies (per-origin)
this.rkey = this.key + ":r";
// LRU cache of robots.txt keys
this.lkey = this.key + ":l";
// pages
this.pageskey = this.key + ":pages";
@@ -1025,6 +1036,38 @@ return inx;
return await this.redis.lpush(this.bkey, behaviorLog);
}
async _updateRobotsAccessTime(robotsUrl: string) {
const accessTime = Date.now();
await this.redis.zadd(this.lkey, accessTime, robotsUrl);
}
async setCachedRobots(robotsUrl: string, body: string) {
await this._updateRobotsAccessTime(robotsUrl);
await this.redis.set(`${this.rkey}:${robotsUrl}`, body);
// prune least-recently used items in zset and robots cache if over limit
const cacheCount = await this.redis.zcard(this.lkey);
if (cacheCount > ROBOTS_CACHE_LIMIT) {
const diff = cacheCount - ROBOTS_CACHE_LIMIT;
const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1);
for (const keyToDelete of keysToDelete) {
logger.debug(
"Deleting cached robots.txt, over cache limit",
{ url: keyToDelete },
"robots",
);
await this.redis.del(`${this.rkey}:${keyToDelete}`);
await this.redis.zrem(this.lkey, keyToDelete);
}
}
}
async getCachedRobots(robotsUrl: string) {
await this._updateRobotsAccessTime(robotsUrl);
return await this.redis.get(`${this.rkey}:${robotsUrl}`);
}
async writeToPagesQueue(
data: Record<string, string | number | boolean | object>,
) {

tests/robots_txt.test.js (new file, 35 lines)

@@ -0,0 +1,35 @@
import child_process from "child_process";
test("test robots.txt is fetched and cached", async () => {
const res = child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
);
const log = res.toString();
// robots.txt not found
expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);
expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
) > 0,
).toBe(true);
// robots.txt found and cached
expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);
expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);
});