mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
15 lines
675 B
JavaScript
15 lines
675 B
JavaScript
import fs from "fs";
|
|
import child_process from "child_process";
|
|
|
|
test("ensure URLs with same query args but in different order considered same URL", async () => {
  // Crawl two seed URLs that carry the same query args in a different order.
  const crawlCommand =
    "docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url 'https://example-com.webrecorder.net/?A=1&B=2' --url 'https://example-com.webrecorder.net/?B=2&A=1' --collection url-norm-1 --scopeType page";
  child_process.execSync(crawlCommand);

  // With URL normalization both seeds collapse into one page, so pages.jsonl
  // should hold exactly one header line plus one page entry.
  const headerLineCount = 1;
  const expectedPageCount = 1;

  const pagesPath = "test-crawls/collections/url-norm-1/pages/pages.jsonl";
  const pagesText = fs.readFileSync(pagesPath, "utf8");
  const pageLines = pagesText.trim().split("\n");

  expect(pageLines.length).toBe(expectedPageCount + headerLineCount);
});
|
|
|