mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Support for per-URL conditional Block Rules (#68)
- Support for block rules specified in YAML config to exclude URLs based on regex, and also negate a rule by specifying `allowOnly` to allow URLs based on certain regex. - Support for conditional blocking for iframes, based on content of iframe text, specified via frameTextMatch regex. - Support for restricting block rules based on containing frame URL, specified via inFrameUrl param. - Testing for various blockRules configurations - Fixes #15 (Support URL-level WARC-writing inclusion/exclusion lists) - optional message to add when a URL is blocked, specified via 'blockMessage' - update README for blockRules - bump pywb dependency to 2.6.0b4
This commit is contained in:
parent
838e1fa1bd
commit
6dbdff9656
8 changed files with 363 additions and 11 deletions
|
@ -11,7 +11,7 @@ FROM ubuntu:bionic
|
|||
RUN apt-get update -y && apt-get install --no-install-recommends -qqy software-properties-common \
|
||||
&& add-apt-repository -y ppa:deadsnakes \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install --no-install-recommends -qqy build-essential fonts-stix locales-all redis-server xvfb gpg-agent curl git \
|
||||
&& apt-get install --no-install-recommends -qqy build-essential fonts-stix locales-all redis-server xvfb gpg-agent curl git socat \
|
||||
python3.8 python3.8-distutils python3.8-dev gpg ca-certificates \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
|
46
README.md
46
README.md
|
@ -47,9 +47,6 @@ Browsertrix Crawler includes a number of additional command-line options, explai
|
|||
<summary><b>The Browsertrix Crawler docker image currently accepts the following parameters:</b></summary>
|
||||
|
||||
```
|
||||
crawler [options]
|
||||
|
||||
Options:
|
||||
--help Show help [boolean]
|
||||
--version Show version number [boolean]
|
||||
--seeds, --url The URL to start crawling from
|
||||
|
@ -87,10 +84,18 @@ Options:
|
|||
single-page-application crawling or
|
||||
when different hashtags load dynamic
|
||||
content
|
||||
--blockRules Additional rules for blocking
|
||||
certain URLs from being loaded, by
|
||||
URL regex and optionally via text
|
||||
match in an iframe
|
||||
[array] [default: []]
|
||||
--blockMessage If specified, when a URL is blocked,
|
||||
a record with this error message is
|
||||
added instead [string]
|
||||
-c, --collection Collection name to crawl to (replay
|
||||
will be accessible under this name
|
||||
in pywb preview)
|
||||
[string] [default: "capture-2021-07-01T23-43-26"]
|
||||
[string] [default: "capture-YYYY-MM-DDThh:mm:ss"]
|
||||
--headless Run in headless mode, otherwise
|
||||
start xvfb[boolean] [default: false]
|
||||
--driver JS driver for the crawler
|
||||
|
@ -140,10 +145,9 @@ Options:
|
|||
an HTTP server with screencast
|
||||
accessible on this port
|
||||
[number] [default: 0]
|
||||
--warcInfo, --warcinfo Optional fields added to the
|
||||
warcinfo record in combined WARCs
|
||||
--config Path to YAML config file
|
||||
--warcinfo Optional fields added to the warcinfo
|
||||
record in combined WARCs
|
||||
|
||||
```
|
||||
</details>
|
||||
|
||||
|
@ -226,6 +230,34 @@ The available types are:
|
|||
The `depth` setting also limits how many pages will be crawled for that seed, while the `limit` option sets the total
|
||||
number of pages crawled from any seed.
|
||||
|
||||
### Block Rules
|
||||
|
||||
While scope rules define which pages are to be crawled, it is also possible to block certain URLs in certain pages or frames from being recorded.
|
||||
|
||||
This is useful for blocking ads or other content that should not be included.
|
||||
|
||||
The block rules can be specified as a list in the `blockRules` field. Each rule can contain the following fields:
|
||||
|
||||
- `url`: regex for URL to match (required)
|
||||
|
||||
- `type`: can be `block` or `allowOnly` (defaults to `block`). The block rule blocks the specified match, while allowOnly inverts the match and allows only the matched URLs, while blocking all others.
|
||||
|
||||
- `inFrameUrl`: if specified, indicates that the rule only applies when `url` is loaded in a specific iframe or top-level frame.
|
||||
|
||||
- `frameTextMatch`: if specified, the text content of the matched URL is fetched and checked against this regex, and the rule applies only if the text also matches. When specified, this field makes the block rule apply only to frame-level resources, e.g. URLs loaded directly in an iframe or top-level frame.
|
||||
|
||||
For example, a very simple block rule that blocks all URLs from 'googleanalytics.com' can be added with:
|
||||
|
||||
```
|
||||
blockRules:
|
||||
- url: googleanalytics.com
|
||||
```
|
||||
|
||||
For additional examples of block rules, see the [tests/blockrules.test.js](tests/blockrules.test.js) file in the test suite.
|
||||
|
||||
If `--blockMessage` is also specified, a blocked URL is replaced with the specified message (added as a WARC resource record).
|
||||
|
||||
|
||||
### Custom Warcinfo Fields
|
||||
|
||||
Custom fields can be added to the `warcinfo` WARC record, generated for each combined WARC. The fields can be specified in the YAML config under the `warcinfo` section or specified individually via the command-line.
|
||||
|
|
|
@ -13,6 +13,7 @@ recorder:
|
|||
source_coll: live
|
||||
cache: always
|
||||
rollover_size: ${ROLLOVER_SIZE}
|
||||
enable_put_custom_record: true
|
||||
|
||||
#autoindex: 10
|
||||
|
||||
|
|
17
crawler.js
17
crawler.js
|
@ -29,6 +29,9 @@ const { parseArgs } = require("./util/argParser");
|
|||
|
||||
const { BROWSER_BIN, BEHAVIOR_LOG_FUNC, HTML_TYPES } = require("./util/constants");
|
||||
|
||||
const { BlockRules } = require("./util/blockrules");
|
||||
|
||||
|
||||
// ============================================================================
|
||||
class Crawler {
|
||||
constructor() {
|
||||
|
@ -55,7 +58,8 @@ class Crawler {
|
|||
|
||||
console.log("Seeds", this.params.scopedSeeds);
|
||||
|
||||
this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;
|
||||
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
|
||||
// capture prefix is the base record prefix (set just above) plus the "/id_/" suffix
this.capturePrefix = this.captureBasePrefix + "/id_/";
|
||||
|
||||
this.gotoOpts = {
|
||||
waitUntil: this.params.waitUntil,
|
||||
|
@ -70,6 +74,9 @@ class Crawler {
|
|||
|
||||
// pages file
|
||||
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
|
||||
|
||||
|
||||
this.blockRules = null;
|
||||
}
|
||||
|
||||
configureUA() {
|
||||
|
@ -298,6 +305,10 @@ class Crawler {
|
|||
|
||||
await this.initPages();
|
||||
|
||||
if (this.params.blockRules) {
|
||||
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage);
|
||||
}
|
||||
|
||||
if (this.params.screencastPort) {
|
||||
this.screencaster = new ScreenCaster(this.cluster, this.params.screencastPort);
|
||||
}
|
||||
|
@ -378,6 +389,10 @@ class Crawler {
|
|||
return;
|
||||
}
|
||||
|
||||
if (this.blockRules) {
|
||||
await this.blockRules.initPage(page);
|
||||
}
|
||||
|
||||
try {
|
||||
await page.goto(url, this.gotoOpts);
|
||||
} catch (e) {
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
pywb>=2.6.0b3
|
||||
#git+https://github.com/webrecorder/pywb@main
|
||||
pywb>=2.6.0b4
|
||||
git+https://github.com/webrecorder/pywb@main
|
||||
uwsgi
|
||||
wacz>=0.3.0
|
||||
|
|
134
tests/blockrules.test.js
Normal file
134
tests/blockrules.test.js
Normal file
|
@ -0,0 +1,134 @@
|
|||
const yaml = require("js-yaml");
|
||||
const child_process = require("child_process");
|
||||
const fs = require("fs");
|
||||
|
||||
/**
 * Runs a crawl inside the browsertrix-crawler docker image, feeding the
 * YAML-serialized config to the crawler over stdin.
 *
 * Failures of the docker command are logged, not rethrown, so the calling
 * test decides the outcome from the CDX index produced by the crawl.
 *
 * @param {string} name - collection name the crawl is written to
 * @param {object} config - crawler config; copied, never modified
 * @param {string} [commandExtra=""] - extra command-line arguments appended to `crawl`
 */
function runCrawl(name, config, commandExtra = "") {
  // work on a copy so the caller's config object is left untouched
  const crawlConfig = {
    ...config,
    generateCDX: true,
    depth: 0,
    collection: name,
  };

  const configYaml = yaml.dump(crawlConfig);

  try {
    const version = require("../package.json").version;

    // `input` is piped to the child's stdin; execSync has no `stdin` option
    const output = child_process.execSync(
      `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin ${commandExtra}`,
      {input: configYaml, encoding: "utf8"}
    );

    console.log(output);
  } catch (error) {
    console.log(error);
  }
}
|
||||
|
||||
/**
 * Checks whether the CDXJ index of a crawled collection contains a value.
 *
 * @param {string} coll - collection name under test-crawls/collections
 * @param {string} value - text to look for anywhere in the index
 * @returns {boolean} true if the index file contains `value`
 * @throws if the index file cannot be read (e.g. the crawl produced no index)
 */
function doesCDXContain(coll, value) {
  const index = fs.readFileSync(`test-crawls/collections/${coll}/indexes/index.cdxj`);
  return index.includes(value);
}
|
||||
|
||||
// Seed pages crawled by the scenarios below.
const IANA_SEED = "https://www.iana.org/";
const OEMBED_SEED = "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI";

// Values looked up in the resulting CDX index.
const ADSENSE_SCRIPT = "https://cse.google.com/adsense/search/async-ads.js";
const VIDEO_MIME = "\"video/mp4\"";

// Each scenario runs one crawl into `collection` and then checks whether
// `needle` is present in that collection's CDX index.
const scenarios = [
  {
    // without blocks, the adsense URL is included
    title: "test crawl without block for specific URL",
    collection: "block-1-no-block",
    config: {
      "url": IANA_SEED,
    },
    needle: ADSENSE_SCRIPT,
    present: true,
  },
  {
    title: "test block rule on specific URL",
    collection: "block-1",
    config: {
      "url": IANA_SEED,
      "blockRules": [
        {"url": "adsense"}
      ]
    },
    needle: ADSENSE_SCRIPT,
    present: false,
  },
  {
    title: "test block rule based on iframe text, content included due to match",
    collection: "block-2",
    config: {
      "url": OEMBED_SEED,
      "blockRules": [{
        "url": "https://www.youtube.com/embed/",
        "frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
        "type": "allowOnly"
      }]
    },
    needle: VIDEO_MIME,
    present: true,
  },
  {
    title: "test block rule based on iframe text, wrong text, content should be excluded",
    collection: "block-3",
    config: {
      "url": OEMBED_SEED,
      "blockRules": [{
        "url": "https://www.youtube.com/embed/",
        "frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\\"",
        "type": "allowOnly"
      }]
    },
    needle: VIDEO_MIME,
    present: false,
  },
  {
    title: "test block rule based on iframe text, block matched",
    collection: "block-4",
    config: {
      "url": OEMBED_SEED,
      "blockRules": [{
        "url": "https://www.youtube.com/embed/",
        "frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
      }]
    },
    needle: VIDEO_MIME,
    present: false,
  },
  {
    title: "test rule based on iframe text not matching, plus allowOnly iframe",
    collection: "non-block-5",
    config: {
      "url": OEMBED_SEED,
      "blockRules": [{
        "url": "example.com/embed/",
        "frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
        "type": "block"
      }, {
        "url": "(youtube.com|example.com)/embed/",
        "type": "allowOnly",
        "inFrameUrl": "oembed.link/",
      }]
    },
    needle: VIDEO_MIME,
    present: true,
  },
  {
    title: "test block url in frame url",
    collection: "block-6",
    config: {
      "url": OEMBED_SEED,
      "blockRules": [{
        "url": "maxresdefault.jpg",
        "type": "block",
        "inFrameUrl": "youtube.com/embed",
      }]
    },
    needle: "\"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg\"",
    present: false,
  },
];

for (const {title, collection, config, needle, present} of scenarios) {
  test(title, () => {
    runCrawl(collection, config);

    expect(doesCDXContain(collection, needle)).toBe(present);
  });
}
|
||||
|
||||
|
||||
|
|
@ -90,6 +90,17 @@ class ArgParser {
|
|||
describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
|
||||
},
|
||||
|
||||
"blockRules": {
|
||||
describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
|
||||
type: "array",
|
||||
default: [],
|
||||
},
|
||||
|
||||
"blockMessage": {
|
||||
describe: "If specified, when a URL is blocked, a record with this error message is added instead",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"collection": {
|
||||
alias: "c",
|
||||
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
|
||||
|
|
159
util/blockrules.js
Normal file
159
util/blockrules.js
Normal file
|
@ -0,0 +1,159 @@
|
|||
const fetch = require("node-fetch");
|
||||
|
||||
// Rule types accepted in the "type" field of a block rule.
const RULE_TYPES = Object.freeze(["block", "allowOnly"]);


// ===========================================================================
/**
 * A single parsed block rule.
 *
 * Fields after construction:
 *  - url: RegExp matched against the request URL, or null if not given
 *  - frameTextMatch: RegExp matched against the fetched frame text, or null
 *  - inFrameUrl: RegExp matched against the URL of the requesting frame, or null
 *  - type: "block" (default) or "allowOnly"
 */
class BlockRule
{
  /**
   * @param {string|object} data - either a URL regex string (shorthand for a
   *   "block" rule) or an object with `url`, and optionally `frameTextMatch`,
   *   `inFrameUrl` and `type`
   * @throws {Error} if `data` is not a string or object, or `type` is unknown
   * @throws {SyntaxError} if one of the regex sources is invalid
   */
  constructor(data) {
    const isShorthand = (typeof data === "string");

    // an empty list entry in the YAML config arrives here as null
    if (!isShorthand && (data === null || typeof data !== "object")) {
      throw new Error("Block rule must be a URL regex string or an object with a \"url\" field");
    }

    if (isShorthand) {
      this.url = new RegExp(data);
      // keep the same fields as the object form, so all rules have one shape
      this.frameTextMatch = null;
      this.inFrameUrl = null;
      this.type = "block";
    } else {
      this.url = data.url ? new RegExp(data.url) : null;
      this.frameTextMatch = data.frameTextMatch ? new RegExp(data.frameTextMatch) : null;
      this.inFrameUrl = data.inFrameUrl ? new RegExp(data.inFrameUrl) : null;
      this.type = data.type || "block";
    }

    if (!RULE_TYPES.includes(this.type)) {
      throw new Error("Rule \"type\" must be: " + RULE_TYPES.join(", ") + ` (got: "${this.type}")`);
    }
  }

  /**
   * @returns {string} multi-line, human-readable description of the rule
   */
  toString() {
    return `\
* Rule for URL Regex: ${this.url}
Type: ${this.type}
In Frame Regex: ${this.inFrameUrl ? this.inFrameUrl : "any"}
Resource Type: ${this.frameTextMatch ? "frame" : "any"}
${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
`;
  }
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
/**
 * Applies a list of block rules to the requests of a page via request
 * interception, and optionally records a message for each blocked URL.
 */
class BlockRules
{
  /**
   * @param {Array<string|object>} blockRules - raw rule definitions, each passed to BlockRule
   * @param {string} blockPutUrl - base URL that block messages are PUT to
   * @param {string} blockErrMsg - message recorded for blocked URLs; nothing is recorded if empty
   */
  constructor(blockRules, blockPutUrl, blockErrMsg) {
    this.rules = [];
    this.blockPutUrl = blockPutUrl;
    this.blockErrMsg = blockErrMsg;
    // URLs a block message was already submitted for
    this.putUrlSet = new Set();

    for (const ruleData of blockRules || []) {
      this.rules.push(new BlockRule(ruleData));
    }

    console.log("URL Block Rules:\n");
    for (const rule of this.rules) {
      console.log(rule.toString());
    }
  }

  /**
   * Enables request interception on the page and routes every request
   * through the block rules. Does nothing when there are no rules, since
   * every request would be continued unchanged anyway.
   *
   * @param {object} page - page object providing setRequestInterception() and on()
   */
  async initPage(page) {
    if (!this.rules.length) {
      return;
    }

    await page.setRequestInterception(true);

    page.on("request", (request) => this.handleRequest(request));
  }

  /**
   * Decides whether an intercepted request is aborted or continued.
   *
   * This runs as an event handler whose returned promise is not awaited by
   * the emitter, so it must never reject: if evaluating the rules fails, the
   * request is continued (fail open) rather than left hanging.
   *
   * @param {object} request - intercepted request
   */
  async handleRequest(request) {
    let url = "";
    let blocked = false;

    try {
      url = request.url();

      // rules only apply to http(s) requests
      if (url.startsWith("http:") || url.startsWith("https:")) {
        for (const rule of this.rules) {
          const {done, block} = await this.shouldBlock(rule, request);

          if (block) {
            blocked = true;
            break;
          }
          if (done) {
            break;
          }
        }
      }
    } catch (e) {
      console.log(`Error evaluating block rules for ${url}, allowing request`, e);
      blocked = false;
    }

    try {
      if (blocked) {
        // not allowed, abort loading this response
        await request.abort();
      } else {
        await request.continue();
      }
    } catch (e) {
      console.log(`Error resolving intercepted request for ${url}`, e);
      return;
    }

    if (blocked) {
      await this.recordBlockMsg(url);
    }
  }

  /**
   * Evaluates a single rule against a request.
   *
   * @param {object} rule - rule with `url`, `inFrameUrl`, `frameTextMatch` (RegExp or unset) and `type`
   * @param {object} request - intercepted request
   * @returns {Promise<{block: boolean, done: boolean}>} `block`: abort the
   *   request; `done`: stop evaluating further rules for this request
   */
  async shouldBlock(rule, request) {
    const reqUrl = request.url();

    const {url, inFrameUrl, frameTextMatch} = rule;

    const allowOnly = ((rule.type || "block") === "allowOnly");

    // frame() may return null; treat that as a frame without a URL
    const frame = request.frame();
    const frameUrl = frame ? frame.url() : "";

    // ignore initial page
    if (frameUrl === "about:blank") {
      return {block: false, done: true};
    }

    // not a frame match, skip rule
    if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
      return {block: false, done: false};
    }

    const urlMatched = Boolean(url && reqUrl.match(url));

    // frame text-based rule: only applies to navigation requests whose URL
    // matched, never blocks otherwise
    if (frameTextMatch) {
      if (!urlMatched || !request.isNavigationRequest()) {
        return {block: false, done: false};
      }

      const textMatched = await this.isTextMatch(request, reqUrl, frameTextMatch);
      return {block: textMatched ? !allowOnly : allowOnly, done: true};
    }

    // for non frame text rule, simply match by URL
    return {block: urlMatched ? !allowOnly : allowOnly, done: false};
  }

  /**
   * Fetches the URL with a separate request and tests the response text
   * against the regex.
   *
   * @param {object} request - intercepted request (unused)
   * @param {string} reqUrl - URL to fetch
   * @param {RegExp} frameTextMatch - regex to look for in the response text
   * @returns {Promise<boolean>} true if the text matches; false if it does
   *   not, or if fetching fails (the error is logged)
   */
  async isTextMatch(request, reqUrl, frameTextMatch) {
    try {
      const res = await fetch(reqUrl);
      const text = await res.text();

      return !!text.match(frameTextMatch);
    } catch (e) {
      console.log(e);
      return false;
    }
  }

  /**
   * Submits the block message for a blocked URL via PUT to `blockPutUrl`,
   * at most once per URL. No-op if no message or PUT URL is configured.
   * Failures are logged and never thrown.
   *
   * @param {string} url - the blocked URL
   */
  async recordBlockMsg(url) {
    if (!this.blockErrMsg || !this.blockPutUrl) {
      return;
    }

    if (this.putUrlSet.has(url)) {
      return;
    }

    this.putUrlSet.add(url);

    try {
      const putUrl = new URL(this.blockPutUrl);
      putUrl.searchParams.set("url", url);
      console.log("put url", putUrl.href);
      await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body: this.blockErrMsg});
    } catch (e) {
      console.log(`Failed to record block message for ${url}`, e);
    }
  }
}
|
||||
|
||||
module.exports.BlockRules = BlockRules;
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue