Support for per-URL conditional Block Rules (#68)

- Support for block rules specified in YAML config to exclude URLs based on regex, and also negate a rule by specifying `allowOnly` to allow URLs based on certain regex.
- Support for conditional blocking for iframes, based on content of iframe text, specified via frameTextMatch regex.
- Support for restricting block rules based on containing frame URL, specified via `inFrameUrl` param.
- Testing for various blockRules configurations
- Fixes Support URL-level WARC-writing inclusion/exclusion lists #15
- optional message to add when a URL is blocked, specified via 'blockMessage'
- update README for blockRules
- bump pywb dependency to 2.6.0b4
This commit is contained in:
Ilya Kreymer 2021-07-19 15:49:43 -07:00 committed by Ilya Kreymer
parent 838e1fa1bd
commit 6dbdff9656
8 changed files with 363 additions and 11 deletions

View file

@ -11,7 +11,7 @@ FROM ubuntu:bionic
RUN apt-get update -y && apt-get install --no-install-recommends -qqy software-properties-common \
&& add-apt-repository -y ppa:deadsnakes \
&& apt-get update -y \
&& apt-get install --no-install-recommends -qqy build-essential fonts-stix locales-all redis-server xvfb gpg-agent curl git \
&& apt-get install --no-install-recommends -qqy build-essential fonts-stix locales-all redis-server xvfb gpg-agent curl git socat \
python3.8 python3.8-distutils python3.8-dev gpg ca-certificates \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

View file

@ -47,9 +47,6 @@ Browsertrix Crawler includes a number of additional command-line options, explai
<summary><b>The Browsertrix Crawler docker image currently accepts the following parameters:</b></summary>
```
crawler [options]
Options:
--help Show help [boolean]
--version Show version number [boolean]
--seeds, --url The URL to start crawling from
@ -87,10 +84,18 @@ Options:
single-page-application crawling or
when different hashtags load dynamic
content
--blockRules Additional rules for blocking
certain URLs from being loaded, by
URL regex and optionally via text
match in an iframe
[array] [default: []]
--blockMessage If specified, when a URL is blocked,
a record with this error message is
added instead [string]
-c, --collection Collection name to crawl to (replay
will be accessible under this name
in pywb preview)
[string] [default: "capture-2021-07-01T23-43-26"]
[string] [default: "capture-YYYY-MM-DDThh:mm:ss"]
--headless Run in headless mode, otherwise
start xvfb[boolean] [default: false]
--driver JS driver for the crawler
@ -140,10 +145,9 @@ Options:
an HTTP server with screencast
accessible on this port
[number] [default: 0]
--warcInfo, --warcinfo Optional fields added to the
warcinfo record in combined WARCs
--config Path to YAML config file
--warcinfo Optional fields added to the warcinfo
record in combined WARCs
```
</details>
@ -226,6 +230,34 @@ The available types are:
The `depth` setting also limits how many pages will be crawled for that seed, while the `limit` option sets the total
number of pages crawled from any seed.
### Block Rules
While scope rules define which pages are to be crawled, it is also possible to block certain URLs in certain pages or frames from being recorded.
This is useful for blocking ads or other content that should not be included.
The block rules can be specified as a list in the `blockRules` field. Each rule can contain one or more of the following fields:
- `url`: regex for URL to match (required)
- `type`: can be `block` or `allowOnly`. The block rule blocks the specified match, while allowOnly inverts the match and allows only the matched URLs, while blocking all others.
- `inFrameUrl`: if specified, indicates that the rule only applies when `url` is loaded in a specific iframe or top-level frame.
- `frameTextMatch`: if specified, the text of the specified URL is checked against the regex, and the rule applies only if the text also matches. When specified, this field makes the block rule apply only to frame-level resources, e.g. URLs loaded directly in an iframe or top-level frame.
For example, a very simple block rule that blocks all URLs from 'googleanalytics.com' can be added with:
```
blockRules:
- url: googleanalytics.com
```
For additional examples of block rules, see the [tests/blockrules.test.js](tests/blockrules.test.js) file in the test suite.
If `--blockMessage` is also specified, a blocked URL is replaced with the specified message (added as a WARC resource record).
### Custom Warcinfo Fields
Custom fields can be added to the `warcinfo` WARC record, generated for each combined WARC. The fields can be specified in the YAML config under the `warcinfo` section or individually via the command-line.

View file

@ -13,6 +13,7 @@ recorder:
source_coll: live
cache: always
rollover_size: ${ROLLOVER_SIZE}
enable_put_custom_record: true
#autoindex: 10

View file

@ -29,6 +29,9 @@ const { parseArgs } = require("./util/argParser");
const { BROWSER_BIN, BEHAVIOR_LOG_FUNC, HTML_TYPES } = require("./util/constants");
const { BlockRules } = require("./util/blockrules");
// ============================================================================
class Crawler {
constructor() {
@ -55,7 +58,8 @@ class Crawler {
console.log("Seeds", this.params.scopedSeeds);
this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;
// Base recording endpoint for this collection; it is also handed to BlockRules
// as the URL that block-message records are PUT to.
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
// Derived from the base prefix (previously misspelled as `captureBasePrerix`,
// which produced "undefined/id_/").
this.capturePrefix = this.captureBasePrefix + "/id_/";
this.gotoOpts = {
waitUntil: this.params.waitUntil,
@ -70,6 +74,9 @@ class Crawler {
// pages file
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
this.blockRules = null;
}
configureUA() {
@ -298,6 +305,10 @@ class Crawler {
await this.initPages();
// `blockRules` defaults to an empty array, which is truthy: check the length so
// that request interception is only enabled when at least one rule is configured.
if (this.params.blockRules && this.params.blockRules.length) {
  this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage);
}
if (this.params.screencastPort) {
this.screencaster = new ScreenCaster(this.cluster, this.params.screencastPort);
}
@ -378,6 +389,10 @@ class Crawler {
return;
}
if (this.blockRules) {
await this.blockRules.initPage(page);
}
try {
await page.goto(url, this.gotoOpts);
} catch (e) {

View file

@ -1,4 +1,4 @@
pywb>=2.6.0b3
#git+https://github.com/webrecorder/pywb@main
pywb>=2.6.0b4
git+https://github.com/webrecorder/pywb@main
uwsgi
wacz>=0.3.0

134
tests/blockrules.test.js Normal file
View file

@ -0,0 +1,134 @@
const yaml = require("js-yaml");
const child_process = require("child_process");
const fs = require("fs");
/**
 * Run a single crawl in the browsertrix-crawler docker image, feeding the
 * config to the crawler as YAML on stdin.
 *
 * @param {string} name - collection name the crawl is written to.
 * @param {Object} config - crawl config; it is copied, never modified.
 * @param {string} [commandExtra] - extra arguments appended to the crawl command.
 */
function runCrawl(name, config, commandExtra = "") {
  // Work on a copy so the caller's config object is left untouched.
  const crawlConfig = Object.assign({}, config, {
    generateCDX: true,
    depth: 0,
    collection: name,
  });

  const configYaml = yaml.dump(crawlConfig);

  try {
    const version = require("../package.json").version;
    // `input` is what pipes the YAML to the child's stdin; execSync has no
    // `stdin` option, so the previous `stdin: "inherit"` entry was a no-op.
    const output = child_process.execSync(`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin ${commandExtra}`, {input: configYaml, encoding: "utf8"});

    console.log(output);
  }
  catch (error) {
    // A failed crawl is only logged here; the CDX assertions in each test
    // are what surface the failure.
    console.log(error);
  }
}
/**
 * Check whether the CDXJ index of a crawled collection contains a value.
 *
 * @param {string} coll - collection name under test-crawls/collections.
 * @param {string} value - text to look for anywhere in the index file.
 * @returns {boolean} true if the index contains `value`.
 */
function doesCDXContain(coll, value) {
  const indexPath = `test-crawls/collections/${coll}/indexes/index.cdxj`;
  // Read as a Buffer (no encoding given); Buffer#indexOf accepts a string needle.
  const cdxIndex = fs.readFileSync(indexPath);
  const position = cdxIndex.indexOf(value);
  return position !== -1;
}
// Shared fixtures for the block rule tests below.
const IANA_URL = "https://www.iana.org/";
const ADSENSE_SCRIPT_URL = "https://cse.google.com/adsense/search/async-ads.js";

// Page used by the iframe-based tests, together with the embed URL the rules target.
const OEMBED_PAGE_URL = "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI";
const YOUTUBE_EMBED_URL = "https://www.youtube.com/embed/";

// Frame text regexes: the first is the one the tests expect to match,
// the second differs only in the last character of the channel id.
const CHANNEL_TEXT_MATCHING = "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"";
const CHANNEL_TEXT_WRONG = "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\\"";

// CDX marker used to tell whether the video itself was captured.
const VIDEO_MIME = "\"video/mp4\"";


test("test crawl without block for specific URL", () => {
  runCrawl("block-1-no-block", {url: IANA_URL});

  // without blocks, the adsense URL is included
  expect(doesCDXContain("block-1-no-block", ADSENSE_SCRIPT_URL)).toBe(true);
});


test("test block rule on specific URL", () => {
  const config = {
    url: IANA_URL,
    blockRules: [
      {url: "adsense"}
    ]
  };

  runCrawl("block-1", config);

  expect(doesCDXContain("block-1", ADSENSE_SCRIPT_URL)).toBe(false);
});


test("test block rule based on iframe text, content included due to match", () => {
  const rule = {
    url: YOUTUBE_EMBED_URL,
    frameTextMatch: CHANNEL_TEXT_MATCHING,
    type: "allowOnly"
  };

  runCrawl("block-2", {url: OEMBED_PAGE_URL, blockRules: [rule]});

  expect(doesCDXContain("block-2", VIDEO_MIME)).toBe(true);
});


test("test block rule based on iframe text, wrong text, content should be excluded", () => {
  const rule = {
    url: YOUTUBE_EMBED_URL,
    frameTextMatch: CHANNEL_TEXT_WRONG,
    type: "allowOnly"
  };

  runCrawl("block-3", {url: OEMBED_PAGE_URL, blockRules: [rule]});

  expect(doesCDXContain("block-3", VIDEO_MIME)).toBe(false);
});


test("test block rule based on iframe text, block matched", () => {
  // no explicit type: the rule is left to its default type
  const rule = {
    url: YOUTUBE_EMBED_URL,
    frameTextMatch: CHANNEL_TEXT_MATCHING,
  };

  runCrawl("block-4", {url: OEMBED_PAGE_URL, blockRules: [rule]});

  expect(doesCDXContain("block-4", VIDEO_MIME)).toBe(false);
});


test("test rule based on iframe text not matching, plus allowOnly iframe", () => {
  const textRule = {
    url: "example.com/embed/",
    frameTextMatch: CHANNEL_TEXT_MATCHING,
    type: "block"
  };

  const allowEmbedsRule = {
    url: "(youtube.com|example.com)/embed/",
    type: "allowOnly",
    inFrameUrl: "oembed.link/",
  };

  runCrawl("non-block-5", {url: OEMBED_PAGE_URL, blockRules: [textRule, allowEmbedsRule]});

  expect(doesCDXContain("non-block-5", VIDEO_MIME)).toBe(true);
});


test("test block url in frame url", () => {
  const rule = {
    url: "maxresdefault.jpg",
    type: "block",
    inFrameUrl: "youtube.com/embed",
  };

  runCrawl("block-6", {url: OEMBED_PAGE_URL, blockRules: [rule]});

  expect(doesCDXContain("block-6", "\"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg\"")).toBe(false);
});

View file

@ -90,6 +90,17 @@ class ArgParser {
describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
},
"blockRules": {
describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
type: "array",
default: [],
},
"blockMessage": {
describe: "If specified, when a URL is blocked, a record with this error message is added instead",
type: "string",
},
"collection": {
alias: "c",
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",

159
util/blockrules.js Normal file
View file

@ -0,0 +1,159 @@
const fetch = require("node-fetch");
// Rule types accepted in the "type" field of a block rule.
const RULE_TYPES = ["block", "allowOnly"];


// ===========================================================================
/**
 * A single parsed block rule.
 *
 * Fields (all regexes are compiled with `new RegExp`, or null when not given):
 * - url: regex matched against the request URL
 * - frameTextMatch: regex matched against the fetched text of the URL
 * - inFrameUrl: regex matched against the URL of the requesting frame
 * - type: "block" or "allowOnly"
 */
class BlockRule
{
  /**
   * @param {string|Object} data - either a URL regex string (shorthand for a
   *   "block" rule), or an object with `url`, `frameTextMatch`, `inFrameUrl`
   *   and `type` fields.
   * @throws {Error} if `data` is neither a string nor an object, or if the
   *   rule type is not one of RULE_TYPES.
   */
  constructor(data) {
    // Always define every field so both rule forms have the same shape.
    this.url = null;
    this.frameTextMatch = null;
    this.inFrameUrl = null;
    this.type = "block";

    if (typeof(data) === "string") {
      this.url = new RegExp(data);
    } else if (data && typeof(data) === "object") {
      this.url = data.url ? new RegExp(data.url) : null;
      this.frameTextMatch = data.frameTextMatch ? new RegExp(data.frameTextMatch) : null;
      this.inFrameUrl = data.inFrameUrl ? new RegExp(data.inFrameUrl) : null;
      this.type = data.type || "block";
    } else {
      // e.g. an empty list entry in the YAML config
      throw new Error("Block rule must be a string or an object, got: " + data);
    }

    if (!RULE_TYPES.includes(this.type)) {
      throw new Error("Rule \"type\" must be: " + RULE_TYPES.join(", "));
    }
  }

  /**
   * @returns {string} multi-line, human-readable summary of the rule.
   */
  toString() {
    return [
      `* Rule for URL Regex: ${this.url}`,
      `Type: ${this.type}`,
      `In Frame Regex: ${this.inFrameUrl ? this.inFrameUrl : "any"}`,
      `Resource Type: ${this.frameTextMatch ? "frame" : "any"}`,
      this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : "",
      "",
    ].join("\n");
  }
}
// ===========================================================================
/**
 * Applies a list of block rules to the requests of a page via request
 * interception, and optionally records a message for each blocked URL.
 */
class BlockRules
{
  /**
   * @param {Array<string|Object>} blockRules - raw rule data; each entry is
   *   passed to `new BlockRule()`.
   * @param {string} blockPutUrl - URL that block messages are PUT to, with the
   *   blocked URL added as the `url` query param.
   * @param {string} [blockErrMsg] - body recorded for each blocked URL; nothing
   *   is recorded when this (or blockPutUrl) is not set.
   */
  constructor(blockRules, blockPutUrl, blockErrMsg) {
    this.rules = [];
    this.blockPutUrl = blockPutUrl;
    this.blockErrMsg = blockErrMsg;
    // URLs for which a block message was already sent
    this.putUrlSet = new Set();

    for (const ruleData of (blockRules || [])) {
      this.rules.push(new BlockRule(ruleData));
    }

    console.log("URL Block Rules:\n");
    for (const rule of this.rules) {
      console.log(rule.toString());
    }
  }

  /**
   * Enable request interception on the page and route every request through
   * the block rules.
   *
   * @param {Object} page - page object providing `setRequestInterception()` and `on()`.
   */
  async initPage(page) {
    await page.setRequestInterception(true);

    // handleRequest() deals with its own errors, so the promise it returns
    // can safely be dropped here.
    page.on("request", (request) => this.handleRequest(request));
  }

  /**
   * Decide whether an intercepted request is aborted or continued.
   *
   * Never rejects: it is invoked un-awaited from an event handler, and a
   * request that is neither continued nor aborted is left pending. If rule
   * evaluation fails, the request is allowed through.
   *
   * @param {Object} request - intercepted request.
   */
  async handleRequest(request) {
    const url = request.url();
    let block = false;

    // only http(s) requests are subject to block rules
    if (url.startsWith("http:") || url.startsWith("https:")) {
      try {
        for (const rule of this.rules) {
          const result = await this.shouldBlock(rule, request);

          if (result.block) {
            block = true;
            break;
          }

          if (result.done) {
            break;
          }
        }
      } catch (e) {
        console.warn("Error evaluating block rules, allowing request: " + url, e);
        block = false;
      }
    }

    try {
      if (block) {
        // not allowed, abort loading this response
        await request.abort();
      } else {
        await request.continue();
      }
    } catch (e) {
      // the request could not be resolved here, so there is nothing to record
      console.warn("Error resolving intercepted request: " + url, e);
      return;
    }

    if (block) {
      await this.recordBlockMsg(url);
    }
  }

  /**
   * Evaluate one rule against a request.
   *
   * @param {Object} rule - rule with `url`, `inFrameUrl`, `frameTextMatch`
   *   (regexes or null) and `type` fields.
   * @param {Object} request - intercepted request.
   * @returns {Promise<{block: boolean, done: boolean}>} `block`: abort the
   *   request; `done`: stop evaluating further rules.
   */
  async shouldBlock(rule, request) {
    const reqUrl = request.url();

    const {url, inFrameUrl, frameTextMatch} = rule;

    const type = rule.type || "block";
    const allowOnly = (type === "allowOnly");

    // frame() can be null; previously that threw here and left the request
    // unresolved. Without a frame, only rules with no frame constraint apply.
    const frame = request.frame();
    const frameUrl = frame ? frame.url() : null;

    // ignore initial page
    if (frameUrl === "about:blank") {
      return {block: false, done: true};
    }

    // not a frame match, skip rule
    if (inFrameUrl && (frameUrl === null || !frameUrl.match(inFrameUrl))) {
      return {block: false, done: false};
    }

    const urlMatched = Boolean(url && reqUrl.match(url));

    // frame text-based rule: only applies to navigation requests whose URL
    // matched, never blocks otherwise
    if (frameTextMatch) {
      if (!urlMatched || !request.isNavigationRequest()) {
        return {block: false, done: false};
      }

      const textMatched = await this.isTextMatch(request, reqUrl, frameTextMatch);
      return {block: textMatched ? !allowOnly : allowOnly, done: true};
    }

    // for non frame text rule, simply match by URL
    return {block: urlMatched ? !allowOnly : allowOnly, done: false};
  }

  /**
   * Fetch a URL and test its text against a regex.
   *
   * @param {Object} request - intercepted request (currently unused).
   * @param {string} reqUrl - URL to fetch.
   * @param {RegExp} frameTextMatch - regex to test the response text with.
   * @returns {Promise<boolean>} true if the text matches; false if it does not
   *   or if the fetch fails.
   */
  async isTextMatch(request, reqUrl, frameTextMatch) {
    try {
      const res = await fetch(reqUrl);
      const text = await res.text();

      return !!text.match(frameTextMatch);
    } catch (e) {
      console.log(e);
      // a failed fetch counts as "no match" (was an implicit undefined)
      return false;
    }
  }

  /**
   * PUT the configured block message for a blocked URL, once per URL.
   * Failures are logged and not propagated, since the request has already
   * been aborted by the time this runs.
   *
   * @param {string} url - the blocked URL.
   */
  async recordBlockMsg(url) {
    if (!this.blockErrMsg || !this.blockPutUrl) {
      return;
    }

    if (this.putUrlSet.has(url)) {
      return;
    }

    this.putUrlSet.add(url);

    const body = this.blockErrMsg;

    try {
      const putUrl = new URL(this.blockPutUrl);
      putUrl.searchParams.set("url", url);
      console.log("put url", putUrl.href);
      await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
    } catch (e) {
      console.warn("Error recording block message for: " + url, e);
    }
  }
}
module.exports.BlockRules = BlockRules;