browsertrix-crawler/tests/adblockrules.test.js
Tessa Walsh e02058f001 Add ad blocking via request interception (#173)
* ad blocking via request interception, extending block rules system, adding new AdBlockRules
* Load list of hosts to block from https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts added as json on image build
* Enabled via --blockAds and setting a custom message via --adBlockMessage
* new test to check for ad blocking
* Add test-crawls dir to .gitignore and .dockerignore
2022-11-15 18:30:27 -08:00

47 lines
1.3 KiB
JavaScript

import child_process from "child_process";
import fs from "fs";
import yaml from "js-yaml";
/**
 * Run a crawl inside the browsertrix-crawler Docker image.
 *
 * The given options are combined with `generateCDX: true`, `depth: 0` and
 * `collection: name`, serialized to YAML, and piped to the container on
 * stdin (`--config stdin`). The host directory `$PWD/test-crawls` is mounted
 * at `/crawls` in the container.
 *
 * @param {string} name - Collection name for the crawl.
 * @param {object} config - Base crawl options; this object is not modified.
 * @param {string} [commandExtra=""] - Extra text appended to the crawl command line.
 * @returns {void}
 */
function runCrawl(name, config, commandExtra = "") {
  // Build a fresh object instead of writing the forced settings onto the
  // caller's config, so a config object can be reused safely between crawls.
  const crawlConfig = {
    ...config,
    generateCDX: true,
    depth: 0,
    collection: name,
  };
  const configYaml = yaml.dump(crawlConfig);

  try {
    // `input` supplies the child's stdin. execSync has no `stdin` option
    // (only `stdio`), so none is passed here.
    const output = child_process.execSync(
      `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
      { input: configYaml, encoding: "utf8" },
    );
    console.log(output);
  } catch (error) {
    // Best-effort: log the failure and return, leaving it to the caller's
    // assertions on the crawl output to decide pass/fail.
    console.log(error);
  }
}
/**
 * Report whether a collection's CDXJ index contains the given value.
 *
 * Reads `test-crawls/collections/<coll>/indexes/index.cdxj` (relative to the
 * current working directory) and searches its raw contents.
 *
 * @param {string} coll - Collection name used to build the index path.
 * @param {string} value - Text to look for in the index file.
 * @returns {boolean} `true` if the value occurs anywhere in the index.
 * @throws If the index file cannot be read.
 */
function doesCDXContain(coll, value) {
  const indexPath = `test-crawls/collections/${coll}/indexes/index.cdxj`;
  const contents = fs.readFileSync(indexPath);
  return contents.includes(value);
}
// Baseline crawl: ad blocking is not enabled, so the index is expected to
// contain a googletagmanager entry.
test("test crawl without ad block for specific URL", () => {
  const collection = "adblock-no-block";

  runCrawl(collection, { url: "https://www.mozilla.org/en-US/firefox/" });

  // without ad blocking, URL with googletagmanager is included
  const hasTagManager = doesCDXContain(collection, "www.googletagmanager.com");
  expect(hasTagManager).toBe(true);
});
// Same page crawled with `blockAds` enabled: the index is expected to
// contain no googletagmanager entry.
test("test crawl with ad block for specific URL", () => {
  const config = {
    "url": "https://www.mozilla.org/en-US/firefox/",
    "blockAds": true,
  };

  runCrawl("adblock-block", config);

  // with ad blocking, URL with googletagmanager is not included
  expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(false);
});