Add ad blocking via request interception (#173)

* Add ad blocking via request interception, extending the block rules system with a new AdBlockRules class
* Load the list of hosts to block from https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts, converted to JSON at image build time
* Enable via --blockAds, with an optional custom block message via --adBlockMessage (see the usage example below)
* Add a new test to check that ad blocking works
* Add test-crawls dir to .gitignore and .dockerignore
Tessa Walsh 2022-10-25 10:53:32 -04:00 committed by Ilya Kreymer
parent 277314f2de
commit e02058f001
8 changed files with 123 additions and 4 deletions
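
With these changes, ad blocking can be enabled on a crawl roughly as follows. This is a sketch based on the docker invocation used in the new test; the URL is the one from the test, while the collection name and block message are illustrative:

  docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl \
    --url https://www.mozilla.org/en-US/firefox/ \
    --collection adblock-example \
    --blockAds \
    --adBlockMessage "Blocked as an advertisement"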

.dockerignore

@@ -1,3 +1,4 @@
output/
node_modules/
crawls/
test-crawls/

.gitignore

@@ -4,3 +4,4 @@ __pycache__
collections/
node_modules/
crawls/
test-crawls/

Dockerfile

@@ -3,6 +3,9 @@ ARG BROWSER_VERSION=105
FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION}
# TODO: Move this into base image
RUN apt-get update && apt-get install -y jq
# needed to add args to main build stage
ARG BROWSER_VERSION
@@ -27,6 +30,12 @@ ADD package.json /app/
# to allow forcing rebuilds from this stage
ARG REBUILD
# Download and format ad host blocklist as JSON
RUN mkdir -p /tmp/ads && cd /tmp/ads && \
curl -vs -o ad-hosts.txt https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts && \
cat ad-hosts.txt | grep '^0.0.0.0 '| awk '{ print $2; }' | grep -v '0.0.0.0' | jq --raw-input --slurp 'split("\n")' > /app/ad-hosts.json && \
rm /tmp/ads/ad-hosts.txt
RUN yarn install
ADD *.js /app/
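
For context, the pipeline above turns hosts-file entries of the form "0.0.0.0 <hostname>" into a JSON array of bare hostnames (the extra grep -v presumably drops the hosts file's own "0.0.0.0 0.0.0.0" entry). Run against a couple of made-up sample entries it behaves roughly like this, with a trailing empty string left over from the final newline:

  $ printf '0.0.0.0 ads.example.com\n0.0.0.0 tracker.example.net\n' |
      grep '^0.0.0.0 ' | awk '{ print $2; }' | grep -v '0.0.0.0' |
      jq --raw-input --slurp 'split("\n")'
  [
    "ads.example.com",
    "tracker.example.net",
    ""
  ]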

crawler.js

@@ -27,7 +27,7 @@ import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI }
import { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
import { BlockRules } from "./util/blockrules.js";
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
// to ignore HTTPS error for HEAD check
import { Agent as HTTPAgent } from "http";
@@ -42,7 +42,6 @@ const HTTP_AGENT = HTTPAgent();
const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
// ============================================================================
export class Crawler {
constructor() {
@@ -99,6 +98,7 @@ export class Crawler {
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
this.blockRules = null;
this.adBlockRules = null;
this.errorCount = 0;
@@ -577,6 +577,10 @@ export class Crawler {
await this.initPages();
if (this.params.blockAds) {
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, (text) => this.debugLog(text));
}
if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
}
@@ -756,6 +760,10 @@
}
}
if (this.adBlockRules) {
await this.adBlockRules.initPage(page);
}
if (this.blockRules) {
await this.blockRules.initPage(page);
}


@@ -0,0 +1,47 @@
import child_process from "child_process";
import fs from "fs";
import yaml from "js-yaml";
function runCrawl(name, config, commandExtra = "") {
config.generateCDX = true;
config.depth = 0;
config.collection = name;
const configYaml = yaml.dump(config);
try {
const proc = child_process.execSync(`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
console.log(proc);
}
catch (error) {
console.log(error);
}
}
function doesCDXContain(coll, value) {
const data = fs.readFileSync(`test-crawls/collections/${coll}/indexes/index.cdxj`);
return data.indexOf(value) >= 0;
}
test("test crawl without ad block for specific URL", () => {
const config = {
"url": "https://www.mozilla.org/en-US/firefox/",
};
runCrawl("adblock-no-block", config);
// without ad blocking, URL with googletagmanager is included
expect(doesCDXContain("adblock-no-block", "www.googletagmanager.com")).toBe(true);
});
test("testcrawl with ad block for specific URL", () => {
const config = {
"url": "https://www.mozilla.org/en-US/firefox/",
"blockAds": true,
};
runCrawl("adblock-block", config);
expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(false);
});
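
The doesCDXContain helper above is a plain substring check against the collection's CDXJ index, where each line ends in a JSON block containing the captured URL. For example, after the first (non-blocking) crawl, the captured ad request could be spotted with something like the following; the path comes from the test config, and the matched line shown is only illustrative of the CDXJ format:

  $ grep googletagmanager test-crawls/collections/adblock-no-block/indexes/index.cdxj
  com,googletagmanager)/gtm.js 20221025000000 {"url": "https://www.googletagmanager.com/gtm.js", ...}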


@@ -6,7 +6,6 @@ import {exec as execCallback } from "child_process";
const exec = util.promisify(execCallback);
test("check that URLs are crawled 2 extra hops beyond depth", async () => {
try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");

util/argParser.js

@@ -111,6 +111,18 @@ class ArgParser {
type: "string",
},
"blockAds": {
alias: "blockads",
describe: "If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
type: "boolean",
default: false,
},
"adBlockMessage": {
describe: "If specified, when an ad is blocked, a record with this error message is added instead",
type: "string",
},
"collection": {
alias: "c",
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",

util/blockrules.js

@@ -1,3 +1,5 @@
import fs from "fs";
const RULE_TYPES = ["block", "allowOnly"];
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
@@ -6,7 +8,8 @@ const BlockState = {
ALLOW: null,
BLOCK_PAGE_NAV: "page",
BLOCK_IFRAME_NAV: "iframe",
BLOCK_OTHER: "resource"
BLOCK_OTHER: "resource",
BLOCK_AD: "advertisement"
};
@@ -222,3 +225,42 @@ export class BlockRules
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
}
}
// ===========================================================================
export class AdBlockRules extends BlockRules
{
constructor(blockPutUrl, blockErrMsg, debugLog, adhostsFilePath = "../ad-hosts.json") {
super([], blockPutUrl, blockErrMsg, debugLog);
this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url)));
}
async initPage(page) {
if (page._btrix_adInterceptionAdded) {
return true;
}
page._btrix_adInterceptionAdded = true;
await page.setRequestInterception(true);
page.on("request", async (request) => {
try {
await this.handleRequest(request);
} catch (e) {
console.warn(e);
}
});
}
async shouldBlock(request, url) {
const fragments = url.split("/");
const domain = fragments.length > 2 ? fragments[2] : null;
if (this.adhosts.includes(domain)) {
this.debugLog(`URL blocked for being an ad: ${url}`);
await this.recordBlockMsg(url);
return BlockState.BLOCK_AD;
}
return BlockState.ALLOW;
}
}
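
Note that fragments[2] above is simply the host portion of the URL (including any port), which is what gets matched against the blocklist entries. A quick check, with an illustrative URL:

  $ node -e 'console.log("https://www.googletagmanager.com/gtm.js".split("/")[2])'
  www.googletagmanager.com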