BlockRules Fixes (0.4.3) (#75)

- blockrules fix: when checking an iframe nav request, match inFrameUrl against the parent iframe, not current one
- blockrules: cleanup, always allow 'pywb.proxy' static files
- logging: when 'debug' logging enabled, log urls blocked and conditional iframe checks from blockrules
- tests: add more complex test for blockrules
- update CHANGES and support info in README
- bump to 0.4.3
This commit is contained in:
Ilya Kreymer 2021-07-27 09:41:21 -07:00 committed by GitHub
parent f0c5ca1035
commit be1ee53c3e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 85 additions and 26 deletions

View file

@ -1,5 +1,10 @@
## CHANGES ## CHANGES
v0.4.3
- BlockRules Fixes: When considering the 'inFrameUrl' for a navigation request for an iframe, use URL of parent frame.
- BlockRules Fixes: Always allow pywb proxy scripts.
- Logging: Improved debug logging for block rules (log blocked requests and conditional iframe requests) when 'debug' set in 'logging'
v0.4.2 v0.4.2
- Compose/docs: Build latest image by default, update README to refer to latest image - Compose/docs: Build latest image by default, update README to refer to latest image
- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing - Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing

View file

@ -484,10 +484,9 @@ Then, loading the `http://localhost:8080/wr-net/https://webrecorder.net/` should
Support Support
------- -------
Initial support for development of Browsertrix Crawler, was provided by [Kiwix](https://kiwix.org/) Initial support for development of Browsertrix Crawler was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
Initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Additional support for Browsertrix Crawler, including for the development of the 0.4.x version, has been provided by [Portico](https://www.portico.org/).
Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
License License

View file

@ -329,7 +329,7 @@ class Crawler {
await this.initPages(); await this.initPages();
if (this.params.blockRules && this.params.blockRules.length) { if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage); this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
} }
if (this.params.screencastPort) { if (this.params.screencastPort) {

View file

@ -1,6 +1,6 @@
{ {
"name": "browsertrix-crawler", "name": "browsertrix-crawler",
"version": "0.4.2", "version": "0.4.3",
"main": "browsertrix-crawler", "main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler", "repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software", "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",

View file

@ -130,4 +130,36 @@ test("test block url in frame url", () => {
}); });
// Integration test combining three block rules in one crawl: two rules scoped
// to frames whose URL matches archiveweb.page, plus a conditional rule for
// YouTube embed iframes. The test name states the intent: external URLs are
// blocked on the main frame, but not inside the YouTube embed.
test("test block rules complex example, block external urls on main frame, but not on youtube", () => {
const config = {
"seeds": [
"https://archiveweb.page/guide/troubleshooting/errors.html",
],
"depth": "0",
"blockRules": [{
// Rule 1: in frames whose URL matches archiveweb.page, allow only requests
// whose URL matches archiveweb.page or www.youtube.com.
"url": "(archiveweb.page|www.youtube.com)",
"type": "allowOnly",
"inFrameUrl": "archiveweb.page"
}, {
// Rule 2: no "type" is given here; the first expectation below relies on
// this rule blocking the lunr.min.js script in archiveweb.page frames.
"url": "https://archiveweb.page/assets/js/vendor/lunr.min.js",
"inFrameUrl": "archiveweb.page"
}, {
// Rule 3: YouTube embed URLs are allowed only when the frame text matches
// the pattern below. After JS string unescaping the pattern targets a
// backslash-escaped "channelId":"..." pair for one specific channel id.
"url": "https://www.youtube.com/embed/",
"type": "allowOnly",
"frameTextMatch": "(\\\\\"channelId\\\\\":\\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\\")"
}],
"combineWARC": true,
// "debug" is included in the logging setting alongside "stats".
"logging": "stats,debug"
};
// NOTE(review): runCrawl and doesCDXContain are helpers defined outside this
// view; presumably they run the crawl for the named collection and search its
// CDX index for the given string -- confirm against their definitions.
runCrawl("block-7", config);
// The explicitly blocked script must not appear in the CDX.
expect(doesCDXContain("block-7", "\"https://archiveweb.page/assets/js/vendor/lunr.min.js\"")).toBe(false);
// A video/mp4 entry is expected in the CDX (presumably the video served via
// the allowed YouTube embed -- TODO confirm).
expect(doesCDXContain("block-7", "\"video/mp4\"")).toBe(true);
});

View file

@ -2,6 +2,8 @@ const fetch = require("node-fetch");
const RULE_TYPES = ["block", "allowOnly"]; const RULE_TYPES = ["block", "allowOnly"];
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
// =========================================================================== // ===========================================================================
class BlockRule class BlockRule
@ -37,10 +39,11 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
// =========================================================================== // ===========================================================================
class BlockRules class BlockRules
{ {
constructor(blockRules, blockPutUrl, blockErrMsg) { constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) {
this.rules = []; this.rules = [];
this.blockPutUrl = blockPutUrl; this.blockPutUrl = blockPutUrl;
this.blockErrMsg = blockErrMsg; this.blockErrMsg = blockErrMsg;
this.debugLog = debugLog;
this.putUrlSet = new Set(); this.putUrlSet = new Set();
for (const ruleData of blockRules) { for (const ruleData of blockRules) {
@ -48,9 +51,9 @@ class BlockRules
} }
if (this.rules.length) { if (this.rules.length) {
console.log("URL Block Rules:\n"); this.debugLog("URL Block Rules:\n");
for (const rule of this.rules) { for (const rule of this.rules) {
console.log(rule.toString()); this.debugLog(rule.toString());
} }
} }
} }
@ -79,15 +82,20 @@ class BlockRules
return; return;
} }
// always allow special pywb proxy script
for (const allowUrl of ALWAYS_ALLOW) {
if (url.startsWith(allowUrl)) {
request.continue();
return;
}
}
for (const rule of this.rules) { for (const rule of this.rules) {
const {done, block} = await this.shouldBlock(rule, request); const {done, block, frameUrl} = await this.shouldBlock(rule, request, url);
if (block) { if (block) {
//const frameUrl = request.frame().url();
//console.log("Blocking/Aborting Request for: " + request.url());
// not allowed, abort loading this response
request.abort(); request.abort();
await this.recordBlockMsg(request.url()); await this.recordBlockMsg(url, frameUrl);
return; return;
} }
if (done) { if (done) {
@ -98,24 +106,37 @@ class BlockRules
request.continue(); request.continue();
} }
async shouldBlock(rule, request) { async shouldBlock(rule, request, reqUrl) {
const reqUrl = request.url();
const {url, inFrameUrl, frameTextMatch} = rule; const {url, inFrameUrl, frameTextMatch} = rule;
const type = rule.type || "block"; const type = rule.type || "block";
const allowOnly = (type === "allowOnly"); const allowOnly = (type === "allowOnly");
const frameUrl = request.frame().url(); const isNavReq = request.isNavigationRequest();
const frame = request.frame();
let frameUrl = null;
if (isNavReq) {
const parentFrame = frame.parentFrame();
if (parentFrame) {
frameUrl = parentFrame.url();
} else {
frameUrl = frame.url();
}
} else {
frameUrl = frame.url();
}
// ignore initial page // ignore initial page
if (frameUrl === "about:blank") { if (frameUrl === "about:blank") {
return {block: false, done: true}; return {block: false, done: true, frameUrl};
} }
// not a frame match, skip rule // not a frame match, skip rule
if (inFrameUrl && !frameUrl.match(inFrameUrl)) { if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
return {block: false, done: false}; return {block: false, done: false, frameUrl};
} }
const urlMatched = (url && reqUrl.match(url)); const urlMatched = (url && reqUrl.match(url));
@ -123,17 +144,18 @@ class BlockRules
// if frame text-based rule: if url matched and a frame request // if frame text-based rule: if url matched and a frame request
// frame text-based match: only applies to nav requests, never block otherwise // frame text-based match: only applies to nav requests, never block otherwise
if (frameTextMatch) { if (frameTextMatch) {
if (!urlMatched || !request.isNavigationRequest()) { if (!urlMatched || !isNavReq) {
return {block: false, done: false}; return {block: false, done: false, frameUrl};
} }
const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly; const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
return {block, done: true}; this.debugLog(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
return {block, done: true, frameUrl};
} }
// for non frame text rule, simply match by URL // for non frame text rule, simply match by URL
const block = urlMatched ? !allowOnly : allowOnly; const block = urlMatched ? !allowOnly : allowOnly;
return {block, done: false}; return {block, done: false, frameUrl};
} }
async isTextMatch(request, reqUrl, frameTextMatch) { async isTextMatch(request, reqUrl, frameTextMatch) {
@ -144,11 +166,13 @@ class BlockRules
return !!text.match(frameTextMatch); return !!text.match(frameTextMatch);
} catch (e) { } catch (e) {
console.log(e); this.debugLog(e);
} }
} }
async recordBlockMsg(url) { async recordBlockMsg(url, frameUrl) {
this.debugLog(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
if (!this.blockErrMsg || !this.blockPutUrl) { if (!this.blockErrMsg || !this.blockPutUrl) {
return; return;
} }
@ -162,7 +186,6 @@ class BlockRules
const body = this.blockErrMsg; const body = this.blockErrMsg;
const putUrl = new URL(this.blockPutUrl); const putUrl = new URL(this.blockPutUrl);
putUrl.searchParams.set("url", url); putUrl.searchParams.set("url", url);
//console.log("put url", putUrl.href);
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body}); await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
} }
} }