Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 06:23:16 +00:00
Add tests for robots.txt being fetched and cached
Does not yet include a test that a page URL disallowed by robots.txt is not queued, as I haven't yet been able to find a Webrecorder-managed site with a robots.txt containing disallow rules to test against. (A possible shape for such a test is sketched after the diff below.)
parent f192e798c5
commit 004c4ebd9b
1 changed file with 35 additions and 0 deletions
tests/robots_txt.test.js | 35 (new file)
@@ -0,0 +1,35 @@
import child_process from "child_process";

test("test robots.txt is fetched and cached", async () => {
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
  );

  const log = res.toString();

  // robots.txt not found
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Robots.txt not fetched","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
    ) > 0,
  ).toBe(true);

  // robots.txt found and cached
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);
});
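Assuming the repository's standard Jest setup, this test can be run with something like yarn test tests/robots_txt.test.js; it requires Docker to be available and the webrecorder/browsertrix-crawler image to be built or pulled locally.

As the commit message notes, a test that a page URL disallowed by robots.txt is not queued is still missing. Below is a minimal sketch of what such a test might look like, assuming a hypothetical site https://example.org/ whose robots.txt disallows /private/; the site, the path, and the asserted log content are all placeholders, not confirmed crawler behavior.

import child_process from "child_process";

// Sketch of the missing disallow test. ASSUMPTIONS: https://example.org/
// and its /private/ disallow rule are hypothetical, and the asserted log
// fragment is a placeholder; the actual debug message emitted when a URL
// is excluded by robots.txt must be confirmed against the crawler's
// robots handling code before this test can be enabled.
test("test URL disallowed by robots.txt is not queued", async () => {
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.org/private/page.html --scopeType page --robots --logging debug",
  );

  const log = res.toString();

  // Placeholder assertion: expect a robots-context debug line referencing
  // the disallowed URL, indicating it was evaluated and skipped.
  expect(
    log.indexOf('"context":"robots"') > 0 &&
      log.indexOf("https://example.org/private/page.html") > 0,
  ).toBe(true);
});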