diff --git a/tests/exclude-redirected.test.js b/tests/exclude-redirected.test.js index b81a0ef8..db33cadf 100644 --- a/tests/exclude-redirected.test.js +++ b/tests/exclude-redirected.test.js @@ -6,7 +6,7 @@ import { execSync } from "child_process"; test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => { execSync( - "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1"); + "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1"); // no entries besides header expect( @@ -19,3 +19,32 @@ test("ensure exclusion is applied on redirected URL, which contains 'help', so i }); + +test("ensure exclusion applied on redirect URL, and URL is not requeued again", () => { + execSync( + "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test-2 --extraHops 1 --url https://www.iana.org/domains/example --url https://example-com.webrecorder.net/page-2 --generateCDX"); + + + // no entries besides header in extraPages.jsonl + expect( + fs + .readFileSync( + "test-crawls/collections/redir-exclude-test-2/pages/extraPages.jsonl", + "utf8", + ).trim().split("\n").length + ).toBe(1); + + + const data = fs.readFileSync( + "test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj", + { encoding: "utf-8" }, + ); + + // expect one occurrence of the pageinfo record in the CDXJ index + const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`); + expect(first > 0).toBe(true); + + // expect no other occurrences after the first + expect(data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`, first + 1)).toBe(-1); + +});