mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Add ad blocking via request interception (#173)
* Ad blocking via request interception, extending the block rules system and adding a new AdBlockRules class
* Load the list of hosts to block from https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts, added as JSON at image build time
* Enabled via --blockAds, with a custom message set via --adBlockMessage
* New test to check for ad blocking
* Add test-crawls dir to .gitignore and .dockerignore
This commit is contained in:
parent
277314f2de
commit
e02058f001
8 changed files with 123 additions and 4 deletions
|
@ -1,3 +1,4 @@
|
|||
output/
|
||||
node_modules/
|
||||
crawls/
|
||||
test-crawls/
|
||||
|
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -4,3 +4,4 @@ __pycache__
|
|||
collections/
|
||||
node_modules/
|
||||
crawls/
|
||||
test-crawls/
|
||||
|
|
|
@ -3,6 +3,9 @@ ARG BROWSER_VERSION=105
|
|||
|
||||
FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION}
|
||||
|
||||
# TODO: Move this into base image
|
||||
RUN apt-get update && apt-get install -y jq
|
||||
|
||||
# needed to add args to main build stage
|
||||
ARG BROWSER_VERSION
|
||||
|
||||
|
@ -27,6 +30,12 @@ ADD package.json /app/
|
|||
# to allow forcing rebuilds from this stage
|
||||
ARG REBUILD
|
||||
|
||||
# Download and format ad host blocklist as JSON
|
||||
RUN mkdir -p /tmp/ads && cd /tmp/ads && \
|
||||
curl -vs -o ad-hosts.txt https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts && \
|
||||
cat ad-hosts.txt | grep '^0.0.0.0 '| awk '{ print $2; }' | grep -v '0.0.0.0' | jq --raw-input --slurp 'split("\n")' > /app/ad-hosts.json && \
|
||||
rm /tmp/ads/ad-hosts.txt
|
||||
|
||||
RUN yarn install
|
||||
|
||||
ADD *.js /app/
|
||||
|
|
12
crawler.js
12
crawler.js
|
@ -27,7 +27,7 @@ import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI }
|
|||
|
||||
import { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
|
||||
|
||||
import { BlockRules } from "./util/blockrules.js";
|
||||
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
|
||||
|
||||
// to ignore HTTPS error for HEAD check
|
||||
import { Agent as HTTPAgent } from "http";
|
||||
|
@ -42,7 +42,6 @@ const HTTP_AGENT = HTTPAgent();
|
|||
const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
|
||||
|
||||
|
||||
|
||||
// ============================================================================
|
||||
export class Crawler {
|
||||
constructor() {
|
||||
|
@ -99,6 +98,7 @@ export class Crawler {
|
|||
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
|
||||
|
||||
this.blockRules = null;
|
||||
this.adBlockRules = null;
|
||||
|
||||
this.errorCount = 0;
|
||||
|
||||
|
@ -577,6 +577,10 @@ export class Crawler {
|
|||
|
||||
await this.initPages();
|
||||
|
||||
if (this.params.blockAds) {
|
||||
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, (text) => this.debugLog(text));
|
||||
}
|
||||
|
||||
if (this.params.blockRules && this.params.blockRules.length) {
|
||||
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
|
||||
}
|
||||
|
@ -756,6 +760,10 @@ export class Crawler {
|
|||
}
|
||||
}
|
||||
|
||||
if (this.adBlockRules) {
|
||||
await this.adBlockRules.initPage(page);
|
||||
}
|
||||
|
||||
if (this.blockRules) {
|
||||
await this.blockRules.initPage(page);
|
||||
}
|
||||
|
|
47
tests/adblockrules.test.js
Normal file
47
tests/adblockrules.test.js
Normal file
|
@ -0,0 +1,47 @@
|
|||
import child_process from "child_process";
|
||||
import fs from "fs";
|
||||
import yaml from "js-yaml";
|
||||
|
||||
// Run a single-page crawl in the browsertrix-crawler Docker image, feeding the
// given config to the container via stdin as YAML. Output is written under
// ./test-crawls so doesCDXContain() can inspect the generated CDX index.
// Failures are logged but not rethrown: assertions on the CDX output are the
// real pass/fail signal for the tests.
function runCrawl(name, config, commandExtra = "") {
  // Build a copy instead of mutating the caller's config object.
  const fullConfig = {
    ...config,
    generateCDX: true,
    depth: 0,
    collection: name,
  };

  const configYaml = yaml.dump(fullConfig);

  try {
    // `input` pipes the YAML to the container's stdin. The previous
    // `stdin: "inherit"` key was not a valid execSync option (the real
    // option is `stdio`) and was silently ignored.
    const proc = child_process.execSync(
      `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
      {input: configYaml, encoding: "utf8"},
    );

    console.log(proc);
  }
  catch (error) {
    console.log(error);
  }
}
|
||||
|
||||
// Return true if the CDXJ index generated for collection `coll` (under
// ./test-crawls) contains the string `value` anywhere.
function doesCDXContain(coll, value) {
  const indexPath = `test-crawls/collections/${coll}/indexes/index.cdxj`;
  const index = fs.readFileSync(indexPath);
  return index.includes(value);
}
|
||||
|
||||
// Baseline: with ad blocking disabled, requests to ad/analytics hosts
// (e.g. googletagmanager.com) are captured and show up in the CDX index.
test("test crawl without ad block for specific URL", () => {
  runCrawl("adblock-no-block", {
    "url": "https://www.mozilla.org/en-US/firefox/",
  });

  // without ad blocking, URL with googletagmanager is included
  expect(doesCDXContain("adblock-no-block", "www.googletagmanager.com")).toBe(true);
});
|
||||
|
||||
// With --blockAds enabled, requests to ad hosts must be blocked, so
// googletagmanager.com must NOT appear in the CDX index.
// Test name fixed from "testcrawl ..." to "test crawl ..." for consistency
// with the other tests in this file.
test("test crawl with ad block for specific URL", () => {
  const config = {
    "url": "https://www.mozilla.org/en-US/firefox/",
    "blockAds": true,
  };

  runCrawl("adblock-block", config);

  expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(false);
});
|
|
@ -6,7 +6,6 @@ import {exec as execCallback } from "child_process";
|
|||
const exec = util.promisify(execCallback);
|
||||
|
||||
|
||||
|
||||
test("check that URLs are crawled 2 extra hops beyond depth", async () => {
|
||||
try {
|
||||
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
|
||||
|
|
|
@ -111,6 +111,18 @@ class ArgParser {
|
|||
type: "string",
|
||||
},
|
||||
|
||||
"blockAds": {
|
||||
alias: "blockads",
|
||||
describe: "If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"adBlockMessage": {
|
||||
describe: "If specified, when an ad is blocked, a record with this error message is added instead",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"collection": {
|
||||
alias: "c",
|
||||
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
import fs from "fs";
|
||||
|
||||
const RULE_TYPES = ["block", "allowOnly"];
|
||||
|
||||
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
|
||||
|
@ -6,7 +8,8 @@ const BlockState = {
|
|||
ALLOW: null,
|
||||
BLOCK_PAGE_NAV: "page",
|
||||
BLOCK_IFRAME_NAV: "iframe",
|
||||
BLOCK_OTHER: "resource"
|
||||
BLOCK_OTHER: "resource",
|
||||
BLOCK_AD: "advertisement"
|
||||
};
|
||||
|
||||
|
||||
|
@ -222,3 +225,42 @@ export class BlockRules
|
|||
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
// Blocks requests to known ad hosts via Puppeteer request interception.
// The host list (converted to a JSON array at image build time) is loaded
// once in the constructor; per-request decisions are made in shouldBlock().
export class AdBlockRules extends BlockRules
{
  /**
   * @param {string} blockPutUrl - capture prefix URL used to record a block message
   * @param {string} blockErrMsg - custom message recorded when an ad is blocked
   * @param {function} debugLog - logging callback
   * @param {string} adhostsFilePath - path of the JSON host list, relative to this module
   */
  constructor(blockPutUrl, blockErrMsg, debugLog, adhostsFilePath = "../ad-hosts.json") {
    super([], blockPutUrl, blockErrMsg, debugLog);
    // Store hosts in a Set: the blocklist has ~100k entries, and the previous
    // Array.includes() was an O(n) scan on every intercepted request.
    this.adhosts = new Set(JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url))));
  }

  // Enable request interception for this page (idempotent per page) and
  // route each request through handleRequest() from the parent class.
  async initPage(page) {
    if (page._btrix_adInterceptionAdded) {
      return true;
    }

    page._btrix_adInterceptionAdded = true;

    await page.setRequestInterception(true);

    page.on("request", async (request) => {
      try {
        await this.handleRequest(request);
      } catch (e) {
        console.warn(e);
      }
    });
  }

  // Extract the hostname from a URL, or null if the URL is unparseable.
  // Using URL() (rather than splitting on "/") strips ports and userinfo,
  // which would otherwise prevent blocklist matches for such URLs.
  hostnameFromUrl(url) {
    try {
      return new URL(url).hostname;
    } catch (e) {
      return null;
    }
  }

  // Return BlockState.BLOCK_AD (and record a block message) if the request
  // URL's host is on the ad blocklist; BlockState.ALLOW otherwise.
  async shouldBlock(request, url) {
    const hostname = this.hostnameFromUrl(url);
    if (hostname && this.adhosts.has(hostname)) {
      this.debugLog(`URL blocked for being an ad: ${url}`);
      await this.recordBlockMsg(url);
      return BlockState.BLOCK_AD;
    }
    return BlockState.ALLOW;
  }
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue