Add option to respect robots.txt disallows (#888)

Fixes #631 
- Adds a --robots flag which, when set, checks the host's robots.txt before each page is queued for further crawling.
- Supports a --robotsAgent flag, which configures the user agent to check in robots.txt in addition to '*'. Defaults to 'Browsertrix/1.x'.
- Robots.txt bodies are parsed and checked for page allow/disallow status using the https://github.com/samclarke/robots-parser library, which is the most active and well-maintained implementation I could find with TypeScript types (a minimal sketch of this check follows the list).
- Fetched robots.txt bodies are cached by their URL in Redis using an LRU cache, retaining the last 100 robots.txt entries, each up to 100K.
- Non-200 responses are treated as empty robots.txt, and an empty robots.txt is treated as 'allow all'.
- Multiple requests for the same robots.txt are batched to perform only one fetch, waiting up to 10 seconds per fetch.
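
As a rough illustration of the check this adds, here is a minimal sketch (not the crawler's actual code): it omits the Redis LRU cache, the proxy dispatcher, the fetch timeout, and the batching of pending fetches that `util/robots.ts` implements. The helper name `isDisallowed` is hypothetical.

```ts
import robotsParser from "robots-parser";

// Fetch a host's robots.txt and check whether a page URL is disallowed.
// Simplified sketch: no caching, batching, timeout, or proxy handling.
async function isDisallowed(
  pageUrl: string,
  agent = "Browsertrix/1.x",
): Promise<boolean> {
  const robotsUrl = new URL("/robots.txt", pageUrl).href;
  const resp = await fetch(robotsUrl);
  // Non-200 responses are treated as an empty robots.txt ("allow all")
  const body = resp.ok ? (await resp.text()).slice(0, 100000) : "";
  if (!body) {
    return false;
  }
  const robots = robotsParser(robotsUrl, body);
  // robots-parser falls back to the '*' group when no rules match `agent`
  return robots.isDisallowed(pageUrl, agent) === true;
}
```

In the actual implementation below, the fetched body (or an empty string for non-200 responses) is cached in Redis before a parser is created, so repeat checks against the same host skip the network entirely.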

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Author: Tessa Walsh, 2025-11-26 22:00:06 -05:00 (committed by GitHub)
parent 75a0c9a305
commit 1d15a155f2
9 changed files with 247 additions and 5 deletions


@@ -103,16 +103,16 @@ Options:
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
"robots"] [default: []]
--logExcludeContext Comma-separated list of contexts to
NOT include in logs
[array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
"robots"] [default: ["recorderNetwork","jsError","screencast"]]
--text Extract initial (default) or final t
ext to pages.jsonl or WARC resource
record(s)

@@ -324,6 +324,12 @@
the Chrome instance (space-separated
or multiple --extraChromeArgs)
[array] [default: []]
--robots If set, fetch and respect page disal
lows specified in per-host robots.tx
t [boolean] [default: false]
--robotsAgent Agent to check in addition to '*' fo
r robots rules
[string] [default: "Browsertrix/1.x"]
--config Path to YAML config file
```


@@ -34,6 +34,7 @@
    "pixelmatch": "^5.3.0",
    "pngjs": "^7.0.0",
    "puppeteer-core": "^24.30.0",
    "robots-parser": "^3.0.1",
    "sax": "^1.3.0",
    "sharp": "^0.32.6",
    "tsc": "^2.0.4",


@@ -72,6 +72,7 @@ import {
import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
import { initProxy } from "./util/proxy.js";
import { initFlow, nextFlowStep } from "./util/flowbehavior.js";
import { isDisallowedByRobots, setRobotsConfig } from "./util/robots.js";

const btrixBehaviors = fs.readFileSync(
  new URL(

@@ -547,6 +548,10 @@ export class Crawler {
    this.headers = { "User-Agent": this.configureUA() };

    if (this.params.robots) {
      setRobotsConfig(this.headers, this.crawlState);
    }

    process.on("exit", () => {
      for (const proc of subprocesses) {
        proc.kill();

@@ -2506,6 +2511,18 @@ self.__bx_behaviors.selectMainBehavior();
      return false;
    }

    if (
      this.params.robots &&
      (await isDisallowedByRobots(url, logDetails, this.params.robotsAgent))
    ) {
      logger.debug(
        "Page URL not queued, disallowed by robots.txt",
        { url, ...logDetails },
        "links",
      );
      return false;
    }

    const result = await this.crawlState.addToQueue(
      { url, seedId, depth, extraHops, ts, pageid },
      this.pageLimit,


@@ -704,6 +704,19 @@ class ArgParser {
      type: "array",
      default: [],
    },

    robots: {
      describe:
        "If set, fetch and respect page disallows specified in per-host robots.txt",
      type: "boolean",
      default: false,
    },

    robotsAgent: {
      describe: "Agent to check in addition to '*' for robots rules",
      type: "string",
      default: "Browsertrix/1.x",
    },
  });
}


@@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
export const PAGE_OP_TIMEOUT_SECS = 5;
export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;

export const ROBOTS_CACHE_LIMIT = 100;

export type ExtractSelector = {
  selector: string;
  extract: string;


@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
  "replay",
  "proxy",
  "scope",
  "robots",
] as const;

export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];

src/util/robots.ts (new file, 124 lines)

@@ -0,0 +1,124 @@
import { fetch } from "undici";
import robotsParser, { Robot } from "robots-parser";

import { LogDetails, logger } from "./logger.js";
import { RedisCrawlState } from "./state.js";
import { getProxyDispatcher } from "./proxy.js";
import { timedRun } from "./timing.js";

let headers: Record<string, string> = {};
let crawlState: RedisCrawlState | null = null;

const pendingFetches: Map<string, Promise<string>> = new Map<
  string,
  Promise<string>
>();

// max seconds to wait to fetch robots
const ROBOTS_FETCH_TIMEOUT = 10;

export function setRobotsConfig(
  _headers: Record<string, string>,
  state: RedisCrawlState,
) {
  headers = _headers;
  crawlState = state;
}

export async function isDisallowedByRobots(
  url: string,
  logDetails: LogDetails,
  robotsAgent: string,
) {
  const robots = await fetchAndParseRobots(url, logDetails);
  return robots && robots.isDisallowed(url, robotsAgent);
}

async function fetchAndParseRobots(
  url: string,
  logDetails: LogDetails,
): Promise<Robot | null> {
  // Fetch robots.txt for url's host and return parser.
  // Results are cached by robots.txt URL in Redis using an LRU cache
  // implementation that retains the 100 most recently used values.
  const urlParser = new URL(url);
  const robotsUrl = `${urlParser.origin}/robots.txt`;

  const cachedRobots = await crawlState!.getCachedRobots(robotsUrl);

  // empty string is valid cached empty robots, so check for null
  if (cachedRobots !== null) {
    // don't create parser, just skip check if empty string
    return cachedRobots ? robotsParser(robotsUrl, cachedRobots) : null;
  }

  try {
    let promise = pendingFetches.get(robotsUrl);
    if (!promise) {
      promise = timedRun(
        fetchRobots(robotsUrl, logDetails),
        ROBOTS_FETCH_TIMEOUT,
        "Fetching Robots timed out",
        logDetails,
        "robots",
      );
      pendingFetches.set(robotsUrl, promise);
    }

    const content = await promise;

    if (content === null) {
      return null;
    }

    logger.debug(
      "Caching robots.txt body",
      { url: robotsUrl, ...logDetails },
      "robots",
    );
    await crawlState!.setCachedRobots(robotsUrl, content);

    // empty string cached, but no need to create parser
    return content ? robotsParser(robotsUrl, content) : null;
  } catch (e) {
    // ignore
  } finally {
    pendingFetches.delete(robotsUrl);
  }

  logger.warn(
    "Failed to fetch robots.txt",
    {
      url: robotsUrl,
      ...logDetails,
    },
    "robots",
  );
  return null;
}

async function fetchRobots(
  url: string,
  logDetails: LogDetails,
): Promise<string | null> {
  logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots");

  const resp = await fetch(url, {
    headers,
    dispatcher: getProxyDispatcher(url),
  });

  if (resp.ok) {
    const buff = await resp.arrayBuffer();
    // only decode and store at most 100K
    return new TextDecoder().decode(buff.slice(0, 100000));
  }

  logger.debug(
    "Robots.txt invalid, storing empty value",
    { url, status: resp.status },
    "robots",
  );

  // for other status errors, just return empty
  return "";
}


@@ -3,7 +3,11 @@ import { v4 as uuidv4 } from "uuid";
import { logger } from "./logger.js";
import {
  MAX_DEPTH,
  DEFAULT_MAX_RETRIES,
  ROBOTS_CACHE_LIMIT,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core";
import { interpolateFilename, UploadResult } from "./storage.js";

@@ -200,7 +204,10 @@ export class RedisCrawlState {
  fkey: string;
  ekey: string;
  bkey: string;
  rkey: string;
  lkey: string;

  pageskey: string;
  esKey: string;
  esMap: string;

@@ -233,6 +240,10 @@ export class RedisCrawlState {
    this.ekey = this.key + ":e";
    // crawler behavior script messages
    this.bkey = this.key + ":b";
    // cached robots.txt bodies (per-origin)
    this.rkey = this.key + ":r";
    // LRU cache of robots.txt keys
    this.lkey = this.key + ":l";

    // pages
    this.pageskey = this.key + ":pages";

@@ -1025,6 +1036,38 @@ return inx;
    return await this.redis.lpush(this.bkey, behaviorLog);
  }

  async _updateRobotsAccessTime(robotsUrl: string) {
    const accessTime = Date.now();
    await this.redis.zadd(this.lkey, accessTime, robotsUrl);
  }

  async setCachedRobots(robotsUrl: string, body: string) {
    await this._updateRobotsAccessTime(robotsUrl);
    await this.redis.set(`${this.rkey}:${robotsUrl}`, body);

    // prune least-recently used items in zset and robots cache if over limit
    const cacheCount = await this.redis.zcard(this.lkey);
    if (cacheCount > ROBOTS_CACHE_LIMIT) {
      const diff = cacheCount - ROBOTS_CACHE_LIMIT;
      const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1);
      for (const keyToDelete of keysToDelete) {
        logger.debug(
          "Deleting cached robots.txt, over cache limit",
          { url: keyToDelete },
          "robots",
        );
        await this.redis.del(`${this.rkey}:${keyToDelete}`);
        await this.redis.zrem(this.lkey, keyToDelete);
      }
    }
  }

  async getCachedRobots(robotsUrl: string) {
    await this._updateRobotsAccessTime(robotsUrl);
    return await this.redis.get(`${this.rkey}:${robotsUrl}`);
  }

  async writeToPagesQueue(
    data: Record<string, string | number | boolean | object>,
  ) {

tests/robots_txt.test.js (new file, 35 lines)

@@ -0,0 +1,35 @@
import child_process from "child_process";

test("test robots.txt is fetched and cached", async () => {
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
  );

  const log = res.toString();

  // robots.txt not found
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
    ) > 0,
  ).toBe(true);

  // robots.txt found and cached
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);
});