mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
tests: update exclude redirect test to test that extra urn:page records are not written for excluded-on-redirect page
This commit is contained in:
parent
fa786f8f35
commit
ba4471f25b
1 changed file with 30 additions and 1 deletion
|
|
@ -6,7 +6,7 @@ import { execSync } from "child_process";
|
||||||
|
|
||||||
test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
|
test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
|
||||||
execSync(
|
execSync(
|
||||||
"docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
|
"docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
|
||||||
|
|
||||||
// no entries besides header
|
// no entries besides header
|
||||||
expect(
|
expect(
|
||||||
|
|
@ -19,3 +19,32 @@ test("ensure exclusion is applied on redirected URL, which contains 'help', so i
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
test("ensure exclusion applied on redirect URL, and URL is not requeued again", () => {
|
||||||
|
execSync(
|
||||||
|
"docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test-2 --extraHops 1 --url https://www.iana.org/domains/example --url https://example-com.webrecorder.net/page-2 --generateCDX");
|
||||||
|
|
||||||
|
|
||||||
|
// no entries besides header
|
||||||
|
expect(
|
||||||
|
fs
|
||||||
|
.readFileSync(
|
||||||
|
"test-crawls/collections/redir-exclude-test-2/pages/extraPages.jsonl",
|
||||||
|
"utf8",
|
||||||
|
).trim().split("\n").length
|
||||||
|
).toBe(1);
|
||||||
|
|
||||||
|
|
||||||
|
const data = fs.readFileSync(
|
||||||
|
"test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj",
|
||||||
|
{ encoding: "utf-8" },
|
||||||
|
);
|
||||||
|
|
||||||
|
// expect one occurrence
|
||||||
|
const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`);
|
||||||
|
expect(first > 0).toBe(true);
|
||||||
|
|
||||||
|
// expect no other occurrences
|
||||||
|
expect(data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`, first + 1)).toBe(-1);
|
||||||
|
|
||||||
|
});
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue