Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-12-07 13:49:47 +00:00
Add option to respect robots.txt disallows (#888)
Fixes #631

- Adds a --robots flag which enables checking robots.txt for each host, for each page, before the page is queued for further crawling.
- Supports a --robotsAgent flag which configures the agent to check in robots.txt, in addition to '*'. Defaults to 'Browsertrix/1.x'.
- robots.txt bodies are parsed and checked for page allow/disallow status using the https://github.com/samclarke/robots-parser library, which is the most active and well-maintained implementation I could find with TypeScript types.
- Fetched robots.txt bodies are cached by their URL in Redis using an LRU cache, retaining the last 100 robots.txt entries, each up to 100K.
- Non-200 responses are treated as empty robots.txt, and an empty robots.txt is treated as 'allow all'.
- Multiple requests for the same robots.txt are batched to perform only one fetch, waiting up to 10 seconds per fetch.

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
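For context, here is a minimal, self-contained sketch of the allow/disallow check that the new src/util/robots.ts performs via robots-parser. The robots.txt body and URLs are illustrative only and are not part of this change.

```ts
// Sketch only: shows how robots-parser answers the allow/disallow question
// the crawler asks before queueing a page. Rules and URLs are made up.
import robotsParser from "robots-parser";

const robotsUrl = "https://example.com/robots.txt";

// A non-200 response or an empty body is treated by the crawler as "allow all";
// here we use a simple body with a single wildcard group.
const body = ["User-agent: *", "Disallow: /private/"].join("\n");

const robots = robotsParser(robotsUrl, body);

// The crawler passes the configured --robotsAgent (default "Browsertrix/1.x");
// with no agent-specific group in the file, the "*" rules apply.
console.log(robots.isDisallowed("https://example.com/private/page", "Browsertrix/1.x")); // true
console.log(robots.isDisallowed("https://example.com/index.html", "Browsertrix/1.x")); // false
```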
parent 75a0c9a305
commit 1d15a155f2

9 changed files with 247 additions and 5 deletions
@@ -103,16 +103,16 @@ Options:
   , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
   ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
   orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
-  atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
-                                                                 [default: []]
+  atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
+  "robots"] [default: []]
       --logExcludeContext                   Comma-separated list of contexts to
                                             NOT include in logs
   [array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
   , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
   ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
   orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
-  atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
-                        [default: ["recorderNetwork","jsError","screencast"]]
+  atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
+  "robots"] [default: ["recorderNetwork","jsError","screencast"]]
       --text                                Extract initial (default) or final t
                                             ext to pages.jsonl or WARC resource
                                             record(s)
@@ -324,6 +324,12 @@ Options:
                                             the Chrome instance (space-separated
                                             or multiple --extraChromeArgs)
                                                          [array] [default: []]
+      --robots                              If set, fetch and respect page disal
+                                            lows specified in per-host robots.tx
+                                            t         [boolean] [default: false]
+      --robotsAgent                         Agent to check in addition to '*' fo
+                                            r robots rules
+                                                [string] [default: "Browsertrix/1.x"]
       --config                              Path to YAML config file
 ```
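As a quick usage sketch (not part of the diff), the new flags can be appended to an existing crawl invocation; the volume mount, seed URL, and agent value below are illustrative, and --robotsAgent falls back to 'Browsertrix/1.x' when omitted:

```
docker run -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl \
  --url https://webrecorder.net/ --scopeType page \
  --robots --robotsAgent "Browsertrix/1.x"
```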
@@ -34,6 +34,7 @@
     "pixelmatch": "^5.3.0",
     "pngjs": "^7.0.0",
     "puppeteer-core": "^24.30.0",
+    "robots-parser": "^3.0.1",
     "sax": "^1.3.0",
     "sharp": "^0.32.6",
     "tsc": "^2.0.4",
@@ -72,6 +72,7 @@ import {
 import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
 import { initProxy } from "./util/proxy.js";
 import { initFlow, nextFlowStep } from "./util/flowbehavior.js";
+import { isDisallowedByRobots, setRobotsConfig } from "./util/robots.js";

 const btrixBehaviors = fs.readFileSync(
   new URL(
@@ -547,6 +548,10 @@ export class Crawler {

     this.headers = { "User-Agent": this.configureUA() };

+    if (this.params.robots) {
+      setRobotsConfig(this.headers, this.crawlState);
+    }
+
     process.on("exit", () => {
       for (const proc of subprocesses) {
         proc.kill();
@@ -2506,6 +2511,18 @@ self.__bx_behaviors.selectMainBehavior();
       return false;
     }

+    if (
+      this.params.robots &&
+      (await isDisallowedByRobots(url, logDetails, this.params.robotsAgent))
+    ) {
+      logger.debug(
+        "Page URL not queued, disallowed by robots.txt",
+        { url, ...logDetails },
+        "links",
+      );
+      return false;
+    }
+
     const result = await this.crawlState.addToQueue(
       { url, seedId, depth, extraHops, ts, pageid },
       this.pageLimit,
@@ -704,6 +704,19 @@ class ArgParser {
         type: "array",
         default: [],
       },
+
+      robots: {
+        describe:
+          "If set, fetch and respect page disallows specified in per-host robots.txt",
+        type: "boolean",
+        default: false,
+      },
+
+      robotsAgent: {
+        describe: "Agent to check in addition to '*' for robots rules",
+        type: "string",
+        default: "Browsertrix/1.x",
+      },
     });
   }

@@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
 export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;

+export const ROBOTS_CACHE_LIMIT = 100;
+
 export type ExtractSelector = {
   selector: string;
   extract: string;
@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
   "replay",
   "proxy",
   "scope",
+  "robots",
 ] as const;

 export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
src/util/robots.ts (new file, 124 lines)
@@ -0,0 +1,124 @@
import { fetch } from "undici";
import robotsParser, { Robot } from "robots-parser";

import { LogDetails, logger } from "./logger.js";
import { RedisCrawlState } from "./state.js";
import { getProxyDispatcher } from "./proxy.js";
import { timedRun } from "./timing.js";

let headers: Record<string, string> = {};
let crawlState: RedisCrawlState | null = null;

const pendingFetches: Map<string, Promise<string>> = new Map<
  string,
  Promise<string>
>();

// max seconds to wait to fetch robots
const ROBOTS_FETCH_TIMEOUT = 10;

export function setRobotsConfig(
  _headers: Record<string, string>,
  state: RedisCrawlState,
) {
  headers = _headers;
  crawlState = state;
}

export async function isDisallowedByRobots(
  url: string,
  logDetails: LogDetails,
  robotsAgent: string,
) {
  const robots = await fetchAndParseRobots(url, logDetails);
  return robots && robots.isDisallowed(url, robotsAgent);
}

async function fetchAndParseRobots(
  url: string,
  logDetails: LogDetails,
): Promise<Robot | null> {
  // Fetch robots.txt for url's host and return parser.
  // Results are cached by robots.txt URL in Redis using an LRU cache
  // implementation that retains the 100 most recently used values.
  const urlParser = new URL(url);
  const robotsUrl = `${urlParser.origin}/robots.txt`;

  const cachedRobots = await crawlState!.getCachedRobots(robotsUrl);
  // empty string is valid cached empty robots, so check for null
  if (cachedRobots !== null) {
    // don't create parser, just skip check if empty string
    return cachedRobots ? robotsParser(robotsUrl, cachedRobots) : null;
  }

  try {
    let promise = pendingFetches.get(robotsUrl);

    if (!promise) {
      promise = timedRun(
        fetchRobots(robotsUrl, logDetails),
        ROBOTS_FETCH_TIMEOUT,
        "Fetching Robots timed out",
        logDetails,
        "robots",
      );
      pendingFetches.set(robotsUrl, promise);
    }

    const content = await promise;

    if (content === null) {
      return null;
    }

    logger.debug(
      "Caching robots.txt body",
      { url: robotsUrl, ...logDetails },
      "robots",
    );
    await crawlState!.setCachedRobots(robotsUrl, content);

    // empty string cached, but no need to create parser
    return content ? robotsParser(robotsUrl, content) : null;
  } catch (e) {
    // ignore
  } finally {
    pendingFetches.delete(robotsUrl);
  }
  logger.warn(
    "Failed to fetch robots.txt",
    {
      url: robotsUrl,
      ...logDetails,
    },
    "robots",
  );
  return null;
}

async function fetchRobots(
  url: string,
  logDetails: LogDetails,
): Promise<string | null> {
  logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots");

  const resp = await fetch(url, {
    headers,
    dispatcher: getProxyDispatcher(url),
  });

  if (resp.ok) {
    const buff = await resp.arrayBuffer();
    // only decode and store at most 100K
    return new TextDecoder().decode(buff.slice(0, 100000));
  }

  logger.debug(
    "Robots.txt invalid, storing empty value",
    { url, status: resp.status },
    "robots",
  );

  // for other status errors, just return empty
  return "";
}
@@ -3,7 +3,11 @@ import { v4 as uuidv4 } from "uuid";

 import { logger } from "./logger.js";

-import { MAX_DEPTH, DEFAULT_MAX_RETRIES } from "./constants.js";
+import {
+  MAX_DEPTH,
+  DEFAULT_MAX_RETRIES,
+  ROBOTS_CACHE_LIMIT,
+} from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 import { interpolateFilename, UploadResult } from "./storage.js";
@@ -200,7 +204,10 @@ export class RedisCrawlState {
   fkey: string;
   ekey: string;
   bkey: string;
+  rkey: string;
+  lkey: string;
   pageskey: string;

   esKey: string;
   esMap: string;
@@ -233,6 +240,10 @@ export class RedisCrawlState {
     this.ekey = this.key + ":e";
     // crawler behavior script messages
     this.bkey = this.key + ":b";
+    // cached robots.txt bodies (per-origin)
+    this.rkey = this.key + ":r";
+    // LRU cache of robots.txt keys
+    this.lkey = this.key + ":l";
     // pages
     this.pageskey = this.key + ":pages";
@@ -1025,6 +1036,38 @@ return inx;
     return await this.redis.lpush(this.bkey, behaviorLog);
   }

+  async _updateRobotsAccessTime(robotsUrl: string) {
+    const accessTime = Date.now();
+    await this.redis.zadd(this.lkey, accessTime, robotsUrl);
+  }
+
+  async setCachedRobots(robotsUrl: string, body: string) {
+    await this._updateRobotsAccessTime(robotsUrl);
+    await this.redis.set(`${this.rkey}:${robotsUrl}`, body);
+
+    // prune least-recently used items in zset and robots cache if over limit
+    const cacheCount = await this.redis.zcard(this.lkey);
+    if (cacheCount > ROBOTS_CACHE_LIMIT) {
+      const diff = cacheCount - ROBOTS_CACHE_LIMIT;
+      const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1);
+
+      for (const keyToDelete of keysToDelete) {
+        logger.debug(
+          "Deleting cached robots.txt, over cache limit",
+          { url: keyToDelete },
+          "robots",
+        );
+        await this.redis.del(`${this.rkey}:${keyToDelete}`);
+        await this.redis.zrem(this.lkey, keyToDelete);
+      }
+    }
+  }
+
+  async getCachedRobots(robotsUrl: string) {
+    await this._updateRobotsAccessTime(robotsUrl);
+    return await this.redis.get(`${this.rkey}:${robotsUrl}`);
+  }
+
   async writeToPagesQueue(
     data: Record<string, string | number | boolean | object>,
   ) {
tests/robots_txt.test.js (new file, 35 lines)
@@ -0,0 +1,35 @@
import child_process from "child_process";

test("test robots.txt is fetched and cached", async () => {
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
  );

  const log = res.toString();

  // robots.txt not found
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
    ) > 0,
  ).toBe(true);

  // robots.txt found and cached
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);
});