browsertrix-crawler/tests/brave-query-redir.test.js
Ilya Kreymer 6a9ca3df54
Don't filter saving redirect if no response body. (#628)
It's possible for a redirect, especially a browser-generated one, to have
headers and no body (e.g. Brave removing a tracking URL query). Don't
filter these redirects out from being written to WARC; just set the payload to an empty
buffer.

fixes #627 where Brave-generated redirect response was not stored.
2024-06-25 15:48:22 -07:00

37 lines
1.1 KiB
JavaScript

import fs from "fs";
import { execSync } from "child_process";
// Regression test for #627: a redirect that carries headers but no body
// (here, the redirect that strips the `gclid` tracking parameter) must still
// be recorded. The test crawls a URL containing `gclid`, then checks the
// generated CDXJ index for both the 307 redirect on the original URL and the
// 200 response on the cleaned URL.
test("check that gclid query URL is automatically redirected to remove it", async () => {
  const trackedUrl = "https://webrecorder.net/about?gclid=abc";
  const cleanUrl = "https://webrecorder.net/about";

  try {
    execSync(
      "docker run --rm -v $PWD/test-crawls:/crawls -i webrecorder/browsertrix-crawler crawl --url 'https://webrecorder.net/about?gclid=abc' --collection test-brave-redir --behaviors \"\" --limit 1 --generateCDX",
    );
  } catch (error) {
    // Best-effort: a non-zero exit is only logged; the index checks below
    // decide whether the test passes. `stderr` is a Buffer here (no encoding
    // was requested), so decode it to get readable output, and fall back to
    // the error message when stderr is absent or empty.
    console.log(error.stderr?.toString() || error.message);
  }

  const filedata = fs.readFileSync(
    "test-crawls/collections/test-brave-redir/indexes/index.cdxj",
    { encoding: "utf-8" },
  );

  let responseFound = false;
  let redirectFound = false;

  for (const line of filedata.split("\n")) {
    // Each CDXJ line is "<key> <timestamp> <json>"; the JSON block is
    // everything after the second space.
    const json = line.trim().split(" ").slice(2).join(" ");
    if (json === "") {
      // Blank or truncated line (e.g. an empty index): skip it so the
      // expectations below report the failure instead of a JSON SyntaxError.
      continue;
    }

    const data = JSON.parse(json);

    // The index stores the status as a string, hence the string comparison.
    if (data.url === trackedUrl && data.status === "307") {
      redirectFound = true;
    } else if (data.url === cleanUrl && data.status === "200") {
      responseFound = true;
    }

    if (responseFound && redirectFound) {
      break;
    }
  }

  // Assert separately so a failure shows which record is missing.
  expect(redirectFound).toBe(true);
  expect(responseFound).toBe(true);
});