Add tests for robots.txt being fetched and cached

Does not yet include testing that a page URL disallowed by robots.txt
is not queued, as I haven't yet been able to find a Webrecorder-managed
site with a robots.txt containing disallow rules to test against.
Tessa Walsh 2025-09-30 11:08:00 -04:00
parent f192e798c5
commit 004c4ebd9b

tests/robots_txt.test.js (new file, 35 lines)

@@ -0,0 +1,35 @@
import child_process from "child_process";

test("test robots.txt is fetched and cached", async () => {
  // --robots enables robots.txt fetching; --logging debug surfaces
  // the robots-context log lines asserted below
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
  );

  const log = res.toString();

  // robots.txt not found: fetch is attempted and the 404 is logged
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Robots.txt not fetched","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
    ) > 0,
  ).toBe(true);

  // robots.txt found: fetched and its body cached
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);
});