browsertrix-crawler/tests/robots_txt.test.js
Tessa Walsh 1d15a155f2
Add option to respect robots.txt disallows (#888)
Fixes #631 
- Adds a --robots flag which enables checking robots.txt for each host before each page is queued for further crawling.
- Supports a --robotsAgent flag which configures the user agent to check in robots.txt, in addition to '*'. Defaults to 'Browsertrix/1.x'.
- Robots.txt bodies are parsed and checked for page allow/disallow status using the https://github.com/samclarke/robots-parser library, which is the most active and well-maintained implementation I could find with TypeScript types (see the parsing sketch after this list).
- Fetched robots.txt bodies are cached by their URL in Redis using an LRU, retaining the last 100 robots.txt entries, each up to 100K (see the caching sketch below).
- Non-200 responses are treated as empty robots.txt bodies, and empty bodies are treated as 'allow all'.
- Multiple requests to the same robots.txt URL are batched to perform only one fetch, waiting up to 10 seconds per fetch (see the batching sketch below).
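
The allow/disallow check can be illustrated with a minimal sketch of the robots-parser API (the URLs, body, and agent string here are examples, not the crawler's actual code):

```js
import robotsParser from "robots-parser";

// Parse a fetched robots.txt body for a given host
const robots = robotsParser(
  "https://example.com/robots.txt",
  "User-agent: *\nDisallow: /private/",
);

robots.isAllowed("https://example.com/page.html", "Browsertrix/1.x"); // true
robots.isAllowed("https://example.com/private/page.html", "Browsertrix/1.x"); // false

// An empty body (as cached for non-200 responses) contains no disallow
// rules, so every URL on the host is allowed
robotsParser("https://example.com/robots.txt", "").isAllowed(
  "https://example.com/anything",
  "Browsertrix/1.x",
); // true
```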
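
One way to express the bounded cache is a recency-tracked set in Redis; this is only a sketch of the described behavior (the key names, client, and helper are hypothetical, not the crawler's actual code):

```js
import Redis from "ioredis";

const redis = new Redis();

// Hypothetical helper: cache a robots.txt body by URL, keeping only the
// 100 most recently used entries, each capped at 100K characters
async function cacheRobotsBody(url, body) {
  await redis.set(`robots:${url}`, body.slice(0, 100_000));
  await redis.zadd("robots:lru", Date.now(), url);

  // Evict everything except the 100 most recently touched URLs
  const stale = await redis.zrange("robots:lru", 0, -101);
  if (stale.length) {
    await redis.del(...stale.map((u) => `robots:${u}`));
    await redis.zrem("robots:lru", ...stale);
  }
}
```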
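
The fetch batching can be sketched by deduplicating in-flight requests per URL (again illustrative: the pending map, helper name, and timeout wiring are assumptions, not the crawler's actual code):

```js
// One in-flight fetch per robots.txt URL; concurrent callers share it
const pending = new Map();

async function fetchRobotsBody(url) {
  if (pending.has(url)) {
    return pending.get(url);
  }

  const promise = (async () => {
    try {
      // Wait at most 10 seconds for the fetch to complete
      const resp = await fetch(url, { signal: AbortSignal.timeout(10_000) });
      // Non-200 responses are stored as an empty, allow-all body
      return resp.status === 200 ? await resp.text() : "";
    } catch (e) {
      return "";
    } finally {
      pending.delete(url);
    }
  })();

  pending.set(url, promise);
  return promise;
}
```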

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
2025-11-26 19:00:06 -08:00


import child_process from "child_process";

test("test robots.txt is fetched and cached", async () => {
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
  );

  const log = res.toString();

  // robots.txt not found: fetch is logged, then a 404 stores an empty value
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
    ) > 0,
  ).toBe(true);

  // robots.txt found: fetch is logged, then the body is cached
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);
});