mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00

It's possible for a redirect, especially a browser-generated one, to have headers and no body (e.g. Brave removing a tracking URL query). Don't filter these redirects out from being written to the WARC; just set the payload to an empty buffer. Fixes #627, where a Brave-generated redirect response was not stored.
37 lines
1.1 KiB
JavaScript
37 lines
1.1 KiB
JavaScript
import fs from "fs";
|
|
import { execSync } from "child_process";
|
|
|
|
/**
 * Integration test: crawls a single page whose URL carries a `gclid` query
 * parameter, then inspects the generated CDXJ index and expects two records:
 *   - a 307 record for the original URL (with the `gclid` query), and
 *   - a 200 record for the same URL without the query.
 */
test("check that gclid query URL is automatically redirected to remove it", async () => {
  const crawlCommand =
    "docker run --rm -v $PWD/test-crawls:/crawls -i webrecorder/browsertrix-crawler crawl --url 'https://webrecorder.net/about?gclid=abc' --collection test-brave-redir --behaviors \"\" --limit 1 --generateCDX";

  try {
    execSync(crawlCommand);
  } catch (error) {
    // A failing crawl is only logged here; reading the index below throws
    // if the crawl did not produce one.
    console.log(error.stderr);
  }

  const indexPath = "test-crawls/collections/test-brave-redir/indexes/index.cdxj";
  const cdxj = fs.readFileSync(indexPath, { encoding: "utf-8" });

  let sawResponse = false;
  let sawRedirect = false;

  for (const entry of cdxj.trim().split("\n")) {
    // Skip the first two space-separated fields of the line; everything after
    // them is re-joined (it may contain spaces) and parsed as the JSON record.
    const [, , ...jsonParts] = entry.split(" ");
    const { url, status } = JSON.parse(jsonParts.join(" "));

    // Status values are compared as strings, matching how they are matched
    // against the index records here.
    if (url === "https://webrecorder.net/about?gclid=abc" && status === "307") {
      sawRedirect = true;
    }
    if (url === "https://webrecorder.net/about" && status === "200") {
      sawResponse = true;
    }

    // Stop scanning as soon as both records have been seen.
    if (sawResponse && sawRedirect) {
      break;
    }
  }

  expect(sawRedirect && sawResponse).toBe(true);
});
|