diff --git a/.dockerignore b/.dockerignore index cbb28a2c..0d01f780 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,3 +1,4 @@ output/ node_modules/ crawls/ +test-crawls/ diff --git a/.gitignore b/.gitignore index f698de63..dfa1820f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ __pycache__ collections/ node_modules/ crawls/ +test-crawls/ diff --git a/Dockerfile b/Dockerfile index 445c1f44..733532a1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,6 +3,9 @@ ARG BROWSER_VERSION=105 FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION} +# TODO: Move this into base image +RUN apt-get update && apt-get install -y jq + # needed to add args to main build stage ARG BROWSER_VERSION @@ -27,6 +30,12 @@ ADD package.json /app/ # to allow forcing rebuilds from this stage ARG REBUILD +# Download and format ad host blocklist as JSON +RUN mkdir -p /tmp/ads && cd /tmp/ads && \ + curl -vs -o ad-hosts.txt https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts && \ + cat ad-hosts.txt | grep '^0.0.0.0 '| awk '{ print $2; }' | grep -v '0.0.0.0' | jq --raw-input --slurp 'split("\n")' > /app/ad-hosts.json && \ + rm /tmp/ads/ad-hosts.txt + RUN yarn install ADD *.js /app/ diff --git a/crawler.js b/crawler.js index 3acda46f..2c6ede86 100644 --- a/crawler.js +++ b/crawler.js @@ -27,7 +27,7 @@ import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } import { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js"; -import { BlockRules } from "./util/blockrules.js"; +import { AdBlockRules, BlockRules } from "./util/blockrules.js"; // to ignore HTTPS error for HEAD check import { Agent as HTTPAgent } from "http"; @@ -42,7 +42,6 @@ const HTTP_AGENT = HTTPAgent(); const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"}); - // ============================================================================ export class Crawler { constructor() { @@ -99,6 +98,7 @@ 
export class Crawler { this.pagesFile = path.join(this.pagesDir, "pages.jsonl"); this.blockRules = null; + this.adBlockRules = null; this.errorCount = 0; @@ -577,6 +577,10 @@ export class Crawler { await this.initPages(); + if (this.params.blockAds) { + this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, (text) => this.debugLog(text)); + } + if (this.params.blockRules && this.params.blockRules.length) { this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text)); } @@ -756,6 +760,10 @@ export class Crawler { } } + if (this.adBlockRules) { + await this.adBlockRules.initPage(page); + } + if (this.blockRules) { await this.blockRules.initPage(page); } diff --git a/tests/adblockrules.test.js b/tests/adblockrules.test.js new file mode 100644 index 00000000..56ed8d92 --- /dev/null +++ b/tests/adblockrules.test.js @@ -0,0 +1,47 @@ +import child_process from "child_process"; +import fs from "fs"; +import yaml from "js-yaml"; + +function runCrawl(name, config, commandExtra = "") { + config.generateCDX = true; + config.depth = 0; + config.collection = name; + + const configYaml = yaml.dump(config); + + try { + const proc = child_process.execSync(`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, {input: configYaml, stdin: "inherit", encoding: "utf8"}); + + console.log(proc); + } + catch (error) { + console.log(error); + } +} + +function doesCDXContain(coll, value) { + const data = fs.readFileSync(`test-crawls/collections/${coll}/indexes/index.cdxj`); + return data.indexOf(value) >= 0; +} + +test("test crawl without ad block for specific URL", () => { + const config = { + "url": "https://www.mozilla.org/en-US/firefox/", + }; + + runCrawl("adblock-no-block", config); + + // without ad blocking, URL with googletagmanager is included + expect(doesCDXContain("adblock-no-block", 
"www.googletagmanager.com")).toBe(true); +}); + +test("test crawl with ad block for specific URL", () => { + const config = { + "url": "https://www.mozilla.org/en-US/firefox/", + "blockAds": true, + }; + + runCrawl("adblock-block", config); + + expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(false); +}); diff --git a/tests/extra_hops_depth.test.js b/tests/extra_hops_depth.test.js index c9aff05a..772bb434 100644 --- a/tests/extra_hops_depth.test.js +++ b/tests/extra_hops_depth.test.js @@ -6,7 +6,6 @@ import {exec as execCallback } from "child_process"; const exec = util.promisify(execCallback); - test("check that URLs are crawled 2 extra hops beyond depth", async () => { try { await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7"); diff --git a/util/argParser.js b/util/argParser.js index 71e8b8ed..250fa716 100644 --- a/util/argParser.js +++ b/util/argParser.js @@ -111,6 +111,18 @@ class ArgParser { type: "string", }, + "blockAds": { + alias: "blockads", + describe: "If set, block advertisements from being loaded (based on Steven Black's blocklist)", + type: "boolean", + default: false, + }, + + "adBlockMessage": { + describe: "If specified, when an ad is blocked, a record with this error message is added instead", + type: "string", + }, + "collection": { alias: "c", describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)", diff --git a/util/blockrules.js b/util/blockrules.js index c793374a..b8ed27cd 100644 --- a/util/blockrules.js +++ b/util/blockrules.js @@ -1,3 +1,5 @@ +import fs from "fs"; + const RULE_TYPES = ["block", "allowOnly"]; const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"]; @@ -6,7 +8,8 @@ const BlockState = { ALLOW: null, BLOCK_PAGE_NAV: "page", BLOCK_IFRAME_NAV: "iframe", - BLOCK_OTHER: 
"resource", + BLOCK_AD: "advertisement" }; @@ -222,3 +225,42 @@ export class BlockRules await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body}); } } + + +// =========================================================================== +export class AdBlockRules extends BlockRules +{ + constructor(blockPutUrl, blockErrMsg, debugLog, adhostsFilePath = "../ad-hosts.json") { + super([], blockPutUrl, blockErrMsg, debugLog); + this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url))); + } + + async initPage(page) { + if (page._btrix_adInterceptionAdded) { + return true; + } + + page._btrix_adInterceptionAdded = true; + + await page.setRequestInterception(true); + + page.on("request", async (request) => { + try { + await this.handleRequest(request); + } catch (e) { + console.warn(e); + } + }); + } + + async shouldBlock(request, url) { + const fragments = url.split("/"); + const domain = fragments.length > 2 ? fragments[2] : null; + if (this.adhosts.includes(domain)) { + this.debugLog(`URL blocked for being an ad: ${url}`); + await this.recordBlockMsg(url); + return BlockState.BLOCK_AD; + } + return BlockState.ALLOW; + } +}