2025-11-26 22:00:06 -05:00
|
|
|
import child_process from "child_process";
|
|
|
|
|
|
|
|
|
|
// Integration test: runs a page-scoped crawl of two seed URLs with
// --useRobots and debug logging enabled, then inspects the crawler's output
// for the "robots" context log messages. Per the expected messages below,
// one host's robots.txt request ends in a 404 (empty value stored) and the
// other host's robots.txt body is cached.
test("test robots.txt is fetched and cached", async () => {
  // execSync blocks until the container exits and throws on a non-zero exit
  // status, so reaching the assertions implies the crawl command succeeded.
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --useRobots --logging debug",
  );

  const log = res.toString();

  // Fragments of the JSON log lines that must appear somewhere in the output.
  const expectedLogFragments = [
    // robots.txt not found
    '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
    '"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',

    // robots.txt found and cached
    '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
    '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
  ];

  // Check plain containment: the previous `indexOf(...) > 0` comparison
  // treated a match at offset 0 as "not found". toContain also reports the
  // missing fragment on failure instead of a bare true/false mismatch.
  for (const fragment of expectedLogFragments) {
    expect(log).toContain(fragment);
  }
});
|