mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Add ad blocking via request interception (#173)
* Ad blocking via request interception, extending the block rules system and adding a new AdBlockRules class
* Load the list of hosts to block from https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts, added as JSON at image build time
* Enabled via --blockAds, with a custom message set via --adBlockMessage
* New test to check for ad blocking
* Add test-crawls dir to .gitignore and .dockerignore
This commit is contained in:
parent
277314f2de
commit
e02058f001
8 changed files with 123 additions and 4 deletions
|
@ -1,3 +1,4 @@
|
|||
output/
|
||||
node_modules/
|
||||
crawls/
|
||||
test-crawls/
|
||||
|
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -4,3 +4,4 @@ __pycache__
|
|||
collections/
|
||||
node_modules/
|
||||
crawls/
|
||||
test-crawls/
|
||||
|
|
|
@ -3,6 +3,9 @@ ARG BROWSER_VERSION=105
|
|||
|
||||
FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION}
|
||||
|
||||
# TODO: Move this into base image
|
||||
RUN apt-get update && apt-get install -y jq
|
||||
|
||||
# needed to add args to main build stage
|
||||
ARG BROWSER_VERSION
|
||||
|
||||
|
@ -27,6 +30,12 @@ ADD package.json /app/
|
|||
# to allow forcing rebuilds from this stage
|
||||
ARG REBUILD
|
||||
|
||||
# Download and format ad host blocklist as JSON
|
||||
RUN mkdir -p /tmp/ads && cd /tmp/ads && \
|
||||
curl -vs -o ad-hosts.txt https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts && \
|
||||
cat ad-hosts.txt | grep '^0.0.0.0 '| awk '{ print $2; }' | grep -v '0.0.0.0' | jq --raw-input --slurp 'split("\n")' > /app/ad-hosts.json && \
|
||||
rm /tmp/ads/ad-hosts.txt
|
||||
|
||||
RUN yarn install
|
||||
|
||||
ADD *.js /app/
|
||||
|
|
12
crawler.js
12
crawler.js
|
@ -27,7 +27,7 @@ import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI }
|
|||
|
||||
import { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
|
||||
|
||||
import { BlockRules } from "./util/blockrules.js";
|
||||
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
|
||||
|
||||
// to ignore HTTPS error for HEAD check
|
||||
import { Agent as HTTPAgent } from "http";
|
||||
|
@ -42,7 +42,6 @@ const HTTP_AGENT = HTTPAgent();
|
|||
const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
|
||||
|
||||
|
||||
|
||||
// ============================================================================
|
||||
export class Crawler {
|
||||
constructor() {
|
||||
|
@ -99,6 +98,7 @@ export class Crawler {
|
|||
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
|
||||
|
||||
this.blockRules = null;
|
||||
this.adBlockRules = null;
|
||||
|
||||
this.errorCount = 0;
|
||||
|
||||
|
@ -577,6 +577,10 @@ export class Crawler {
|
|||
|
||||
await this.initPages();
|
||||
|
||||
if (this.params.blockAds) {
|
||||
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, (text) => this.debugLog(text));
|
||||
}
|
||||
|
||||
if (this.params.blockRules && this.params.blockRules.length) {
|
||||
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
|
||||
}
|
||||
|
@ -756,6 +760,10 @@ export class Crawler {
|
|||
}
|
||||
}
|
||||
|
||||
if (this.adBlockRules) {
|
||||
await this.adBlockRules.initPage(page);
|
||||
}
|
||||
|
||||
if (this.blockRules) {
|
||||
await this.blockRules.initPage(page);
|
||||
}
|
||||
|
|
47
tests/adblockrules.test.js
Normal file
47
tests/adblockrules.test.js
Normal file
|
@ -0,0 +1,47 @@
|
|||
import child_process from "child_process";
|
||||
import fs from "fs";
|
||||
import yaml from "js-yaml";
|
||||
|
||||
// Run a single-page crawl in the browsertrix-crawler Docker image, feeding the
// given config to the container via stdin as YAML. Output is written under
// ./test-crawls so doesCDXContain() can inspect the generated CDX index.
// Failures are logged but not rethrown: assertions on the CDX output are the
// real pass/fail signal for the tests.
function runCrawl(name, config, commandExtra = "") {
  // Build a copy instead of mutating the caller's config object.
  const fullConfig = {
    ...config,
    generateCDX: true,
    depth: 0,
    collection: name,
  };

  const configYaml = yaml.dump(fullConfig);

  try {
    // `input` pipes the YAML to the container's stdin. The previous
    // `stdin: "inherit"` key was not a valid execSync option (the real
    // option is `stdio`) and was silently ignored.
    const proc = child_process.execSync(
      `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
      {input: configYaml, encoding: "utf8"},
    );

    console.log(proc);
  }
  catch (error) {
    console.log(error);
  }
}
|
||||
|
||||
// Return true if the CDXJ index generated for collection `coll` (under
// ./test-crawls) contains the string `value` anywhere.
function doesCDXContain(coll, value) {
  const indexPath = `test-crawls/collections/${coll}/indexes/index.cdxj`;
  const index = fs.readFileSync(indexPath);
  return index.includes(value);
}
|
||||
|
||||
// Baseline: with ad blocking disabled, requests to ad/analytics hosts
// (e.g. googletagmanager.com) are captured and show up in the CDX index.
test("test crawl without ad block for specific URL", () => {
  runCrawl("adblock-no-block", {
    "url": "https://www.mozilla.org/en-US/firefox/",
  });

  // without ad blocking, URL with googletagmanager is included
  expect(doesCDXContain("adblock-no-block", "www.googletagmanager.com")).toBe(true);
});
|
||||
|
||||
// With --blockAds enabled, requests to ad hosts must be blocked, so
// googletagmanager.com must NOT appear in the CDX index.
// Test name fixed from "testcrawl ..." to "test crawl ..." for consistency
// with the other tests in this file.
test("test crawl with ad block for specific URL", () => {
  const config = {
    "url": "https://www.mozilla.org/en-US/firefox/",
    "blockAds": true,
  };

  runCrawl("adblock-block", config);

  expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(false);
});
|
|
@ -6,7 +6,6 @@ import {exec as execCallback } from "child_process";
|
|||
const exec = util.promisify(execCallback);
|
||||
|
||||
|
||||
|
||||
test("check that URLs are crawled 2 extra hops beyond depth", async () => {
|
||||
try {
|
||||
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
|
||||
|
|
|
@ -111,6 +111,18 @@ class ArgParser {
|
|||
type: "string",
|
||||
},
|
||||
|
||||
"blockAds": {
|
||||
alias: "blockads",
|
||||
describe: "If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"adBlockMessage": {
|
||||
describe: "If specified, when an ad is blocked, a record with this error message is added instead",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"collection": {
|
||||
alias: "c",
|
||||
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
import fs from "fs";
|
||||
|
||||
const RULE_TYPES = ["block", "allowOnly"];
|
||||
|
||||
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
|
||||
|
@ -6,7 +8,8 @@ const BlockState = {
|
|||
ALLOW: null,
|
||||
BLOCK_PAGE_NAV: "page",
|
||||
BLOCK_IFRAME_NAV: "iframe",
|
||||
BLOCK_OTHER: "resource"
|
||||
BLOCK_OTHER: "resource",
|
||||
BLOCK_AD: "advertisement"
|
||||
};
|
||||
|
||||
|
||||
|
@ -222,3 +225,42 @@ export class BlockRules
|
|||
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
// Blocks requests to known ad hosts via Puppeteer request interception.
// The host list (converted to a JSON array at image build time) is loaded
// once in the constructor; per-request decisions are made in shouldBlock().
export class AdBlockRules extends BlockRules
{
  /**
   * @param {string} blockPutUrl - capture prefix URL used to record a block message
   * @param {string} blockErrMsg - custom message recorded when an ad is blocked
   * @param {function} debugLog - logging callback
   * @param {string} adhostsFilePath - path of the JSON host list, relative to this module
   */
  constructor(blockPutUrl, blockErrMsg, debugLog, adhostsFilePath = "../ad-hosts.json") {
    super([], blockPutUrl, blockErrMsg, debugLog);
    // Store hosts in a Set: the blocklist has ~100k entries, and the previous
    // Array.includes() was an O(n) scan on every intercepted request.
    this.adhosts = new Set(JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url))));
  }

  // Enable request interception for this page (idempotent per page) and
  // route each request through handleRequest() from the parent class.
  async initPage(page) {
    if (page._btrix_adInterceptionAdded) {
      return true;
    }

    page._btrix_adInterceptionAdded = true;

    await page.setRequestInterception(true);

    page.on("request", async (request) => {
      try {
        await this.handleRequest(request);
      } catch (e) {
        console.warn(e);
      }
    });
  }

  // Extract the hostname from a URL, or null if the URL is unparseable.
  // Using URL() (rather than splitting on "/") strips ports and userinfo,
  // which would otherwise prevent blocklist matches for such URLs.
  hostnameFromUrl(url) {
    try {
      return new URL(url).hostname;
    } catch (e) {
      return null;
    }
  }

  // Return BlockState.BLOCK_AD (and record a block message) if the request
  // URL's host is on the ad blocklist; BlockState.ALLOW otherwise.
  async shouldBlock(request, url) {
    const hostname = this.hostnameFromUrl(url);
    if (hostname && this.adhosts.has(hostname)) {
      this.debugLog(`URL blocked for being an ad: ${url}`);
      await this.recordBlockMsg(url);
      return BlockState.BLOCK_AD;
    }
    return BlockState.ALLOW;
  }
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue