browsertrix-crawler/tests/robots_txt.test.js

import child_process from "child_process";

/**
 * Integration test for the --useRobots option.
 *
 * Runs the crawler image once via Docker against two seed URLs with debug
 * logging enabled, then checks the captured stdout for the "robots" context
 * log entries: one host whose robots.txt fetch is expected to be logged as
 * invalid with status 404, and one host whose robots.txt body is expected to
 * be logged as cached.
 *
 * Requires a working `docker` CLI, the webrecorder/browsertrix-crawler image
 * and network access to both hosts.
 */
test("test robots.txt is fetched and cached", () => {
  // execSync blocks until the container exits and throws if the command
  // exits with a non-zero status, which fails the test.
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --useRobots --logging debug",
  );

  const log = res.toString();

  // toContain is a plain substring check: unlike `indexOf(...) > 0` it also
  // accepts a match at offset 0, and on failure it reports the expected text.

  // robots.txt not found
  expect(log).toContain(
    '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
  );

  expect(log).toContain(
    '"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
  );

  // robots.txt found and cached
  expect(log).toContain(
    '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
  );

  expect(log).toContain(
    '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
  );
});
Add option to respect robots.txt disallows (#888) Fixes #631 - Adds --robots flag which will enable checking robots.txt for each host for each page, before the page is queued for further crawling. - Supports --robotsAgent flag which configures agent to check in robots.txt, in addition to '*'. Defaults to 'Browsertrix/1.x' - Robots.txt bodies are parsed and checked for page allow/disallow status using the https://github.com/samclarke/robots-parser library, which is the most active and well-maintained implementation I could find with TypeScript types. - Fetched robots.txt bodies are cached by their URL in Redis using an LRU, retaining the last 100 robots entries, each up to 100K - Non-200 responses are treated as empty robots, and empty robots are treated as 'allow all' - Multiple requests to the same robots.txt are batched to perform only one fetch, waiting up to 10 seconds per fetch. --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com> 2025-11-26 22:00:06 -05:00			`import child_process from "child_process";`

			`test("test robots.txt is fetched and cached", async () => {`
			`const res = child_process.execSync(`
Rename robots flag to --useRobots, keep --robots as alias (#932) Follow-up to https://github.com/webrecorder/browsertrix-crawler/issues/631 Based on feedback from https://github.com/webrecorder/browsertrix/pull/3029 Renaming `--robots` to `--useRobots` will allow us to keep the Browsertrix backend API more consistent with similar flags like `--useSitemap`. Keeping `--robots` as it's a nice shorthand alias. 2025-12-02 18:55:25 -05:00			`"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --useRobots --logging debug",`
Add option to respect robots.txt disallows (#888) Fixes #631 - Adds --robots flag which will enable checking robots.txt for each host for each page, before the page is queued for further crawling. - Supports --robotsAgent flag which configures agent to check in robots.txt, in addition to '*'. Defaults to 'Browsertrix/1.x' - Robots.txt bodies are parsed and checked for page allow/disallow status using the https://github.com/samclarke/robots-parser library, which is the most active and well-maintained implementation I could find with TypeScript types. - Fetched robots.txt bodies are cached by their URL in Redis using an LRU, retaining the last 100 robots entries, each up to 100K - Non-200 responses are treated as empty robots, and empty robots are treated as 'allow all' - Multiple requests to the same robots.txt are batched to perform only one fetch, waiting up to 10 seconds per fetch. --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com> 2025-11-26 22:00:06 -05:00			`);`

			`const log = res.toString();`

			`// robots.txt not found`
			`expect(`
			`log.indexOf(`
			`'"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',`
			`) > 0,`
			`).toBe(true);`

			`expect(`
			`log.indexOf(`
			`'"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',`
			`) > 0,`
			`).toBe(true);`

			`// robots.txt found and cached`
			`expect(`
			`log.indexOf(`
			`'"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',`
			`) > 0,`
			`).toBe(true);`

			`expect(`
			`log.indexOf(`
			`'"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',`
			`) > 0,`
			`).toBe(true);`
			`});`