mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
BlockRules Fixes (0.4.3) (#75)
- blockrules fix: when checking an iframe nav request, match inFrameUrl against the parent iframe, not current one
- blockrules: cleanup, always allow 'pywb.proxy' static files
- logging: when 'debug' logging enabled, log urls blocked and conditional iframe checks from blockrules
- tests: add more complex test for blockrules
- update CHANGES and support info in README
- bump to 0.4.3
This commit is contained in:
parent
f0c5ca1035
commit
be1ee53c3e
6 changed files with 85 additions and 26 deletions
|
@ -1,5 +1,10 @@
|
||||||
## CHANGES
|
## CHANGES
|
||||||
|
|
||||||
|
v0.4.3
|
||||||
|
- BlockRules Fixes: When considering the 'inFrameUrl' for a navigation request for an iframe, use the URL of the parent frame.
|
||||||
|
- BlockRules Fixes: Always allow pywb proxy scripts.
|
||||||
|
- Logging: Improved debug logging for block rules (log blocked requests and conditional iframe requests) when 'debug' is set in 'logging'
|
||||||
|
|
||||||
v0.4.2
|
v0.4.2
|
||||||
- Compose/docs: Build latest image by default, update README to refer to latest image
|
- Compose/docs: Build latest image by default, update README to refer to latest image
|
||||||
- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing
|
- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing
|
||||||
|
|
|
@ -484,10 +484,9 @@ Then, loading the `http://localhost:8080/wr-net/https://webrecorder.net/` should
|
||||||
Support
|
Support
|
||||||
-------
|
-------
|
||||||
|
|
||||||
Initial support for development of Browsertrix Crawler, was provided by [Kiwix](https://kiwix.org/)
|
Initial support for development of Browsertrix Crawler was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
|
||||||
|
|
||||||
Initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between
|
Additional support for Browsertrix Crawler, including for the development of the 0.4.x version, has been provided by [Portico](https://www.portico.org/).
|
||||||
Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
|
|
||||||
|
|
||||||
|
|
||||||
License
|
License
|
||||||
|
|
|
@ -329,7 +329,7 @@ class Crawler {
|
||||||
await this.initPages();
|
await this.initPages();
|
||||||
|
|
||||||
if (this.params.blockRules && this.params.blockRules.length) {
|
if (this.params.blockRules && this.params.blockRules.length) {
|
||||||
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage);
|
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.params.screencastPort) {
|
if (this.params.screencastPort) {
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "browsertrix-crawler",
|
"name": "browsertrix-crawler",
|
||||||
"version": "0.4.2",
|
"version": "0.4.3",
|
||||||
"main": "browsertrix-crawler",
|
"main": "browsertrix-crawler",
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||||
|
|
|
@ -130,4 +130,36 @@ test("test block url in frame url", () => {
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
test("test block rules complex example, block external urls on main frame, but not on youtube", () => {
|
||||||
|
const config = {
|
||||||
|
"seeds": [
|
||||||
|
"https://archiveweb.page/guide/troubleshooting/errors.html",
|
||||||
|
],
|
||||||
|
"depth": "0",
|
||||||
|
"blockRules": [{
|
||||||
|
"url": "(archiveweb.page|www.youtube.com)",
|
||||||
|
"type": "allowOnly",
|
||||||
|
"inFrameUrl": "archiveweb.page"
|
||||||
|
}, {
|
||||||
|
"url": "https://archiveweb.page/assets/js/vendor/lunr.min.js",
|
||||||
|
"inFrameUrl": "archiveweb.page"
|
||||||
|
}, {
|
||||||
|
"url": "https://www.youtube.com/embed/",
|
||||||
|
"type": "allowOnly",
|
||||||
|
"frameTextMatch": "(\\\\\"channelId\\\\\":\\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\\")"
|
||||||
|
}],
|
||||||
|
|
||||||
|
"combineWARC": true,
|
||||||
|
|
||||||
|
"logging": "stats,debug"
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
runCrawl("block-7", config);
|
||||||
|
|
||||||
|
expect(doesCDXContain("block-7", "\"https://archiveweb.page/assets/js/vendor/lunr.min.js\"")).toBe(false);
|
||||||
|
expect(doesCDXContain("block-7", "\"video/mp4\"")).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,8 @@ const fetch = require("node-fetch");
|
||||||
|
|
||||||
const RULE_TYPES = ["block", "allowOnly"];
|
const RULE_TYPES = ["block", "allowOnly"];
|
||||||
|
|
||||||
|
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
|
||||||
|
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
class BlockRule
|
class BlockRule
|
||||||
|
@ -37,10 +39,11 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
class BlockRules
|
class BlockRules
|
||||||
{
|
{
|
||||||
constructor(blockRules, blockPutUrl, blockErrMsg) {
|
constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) {
|
||||||
this.rules = [];
|
this.rules = [];
|
||||||
this.blockPutUrl = blockPutUrl;
|
this.blockPutUrl = blockPutUrl;
|
||||||
this.blockErrMsg = blockErrMsg;
|
this.blockErrMsg = blockErrMsg;
|
||||||
|
this.debugLog = debugLog;
|
||||||
this.putUrlSet = new Set();
|
this.putUrlSet = new Set();
|
||||||
|
|
||||||
for (const ruleData of blockRules) {
|
for (const ruleData of blockRules) {
|
||||||
|
@ -48,9 +51,9 @@ class BlockRules
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.rules.length) {
|
if (this.rules.length) {
|
||||||
console.log("URL Block Rules:\n");
|
this.debugLog("URL Block Rules:\n");
|
||||||
for (const rule of this.rules) {
|
for (const rule of this.rules) {
|
||||||
console.log(rule.toString());
|
this.debugLog(rule.toString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -79,15 +82,20 @@ class BlockRules
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// always allow special pywb proxy script
|
||||||
|
for (const allowUrl of ALWAYS_ALLOW) {
|
||||||
|
if (url.startsWith(allowUrl)) {
|
||||||
|
request.continue();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (const rule of this.rules) {
|
for (const rule of this.rules) {
|
||||||
const {done, block} = await this.shouldBlock(rule, request);
|
const {done, block, frameUrl} = await this.shouldBlock(rule, request, url);
|
||||||
|
|
||||||
if (block) {
|
if (block) {
|
||||||
//const frameUrl = request.frame().url();
|
|
||||||
//console.log("Blocking/Aborting Request for: " + request.url());
|
|
||||||
// not allowed, abort loading this response
|
|
||||||
request.abort();
|
request.abort();
|
||||||
await this.recordBlockMsg(request.url());
|
await this.recordBlockMsg(url, frameUrl);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (done) {
|
if (done) {
|
||||||
|
@ -98,24 +106,37 @@ class BlockRules
|
||||||
request.continue();
|
request.continue();
|
||||||
}
|
}
|
||||||
|
|
||||||
async shouldBlock(rule, request) {
|
async shouldBlock(rule, request, reqUrl) {
|
||||||
const reqUrl = request.url();
|
|
||||||
|
|
||||||
const {url, inFrameUrl, frameTextMatch} = rule;
|
const {url, inFrameUrl, frameTextMatch} = rule;
|
||||||
|
|
||||||
const type = rule.type || "block";
|
const type = rule.type || "block";
|
||||||
const allowOnly = (type === "allowOnly");
|
const allowOnly = (type === "allowOnly");
|
||||||
|
|
||||||
const frameUrl = request.frame().url();
|
const isNavReq = request.isNavigationRequest();
|
||||||
|
|
||||||
|
const frame = request.frame();
|
||||||
|
|
||||||
|
let frameUrl = null;
|
||||||
|
|
||||||
|
if (isNavReq) {
|
||||||
|
const parentFrame = frame.parentFrame();
|
||||||
|
if (parentFrame) {
|
||||||
|
frameUrl = parentFrame.url();
|
||||||
|
} else {
|
||||||
|
frameUrl = frame.url();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
frameUrl = frame.url();
|
||||||
|
}
|
||||||
|
|
||||||
// ignore initial page
|
// ignore initial page
|
||||||
if (frameUrl === "about:blank") {
|
if (frameUrl === "about:blank") {
|
||||||
return {block: false, done: true};
|
return {block: false, done: true, frameUrl};
|
||||||
}
|
}
|
||||||
|
|
||||||
// not a frame match, skip rule
|
// not a frame match, skip rule
|
||||||
if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
|
if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
|
||||||
return {block: false, done: false};
|
return {block: false, done: false, frameUrl};
|
||||||
}
|
}
|
||||||
|
|
||||||
const urlMatched = (url && reqUrl.match(url));
|
const urlMatched = (url && reqUrl.match(url));
|
||||||
|
@ -123,17 +144,18 @@ class BlockRules
|
||||||
// if frame text-based rule: if url matched and a frame request
|
// if frame text-based rule: if url matched and a frame request
|
||||||
// frame text-based match: only applies to nav requests, never block otherwise
|
// frame text-based match: only applies to nav requests, never block otherwise
|
||||||
if (frameTextMatch) {
|
if (frameTextMatch) {
|
||||||
if (!urlMatched || !request.isNavigationRequest()) {
|
if (!urlMatched || !isNavReq) {
|
||||||
return {block: false, done: false};
|
return {block: false, done: false, frameUrl};
|
||||||
}
|
}
|
||||||
|
|
||||||
const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
|
const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
|
||||||
return {block, done: true};
|
this.debugLog(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
|
||||||
|
return {block, done: true, frameUrl};
|
||||||
}
|
}
|
||||||
|
|
||||||
// for non frame text rule, simply match by URL
|
// for non frame text rule, simply match by URL
|
||||||
const block = urlMatched ? !allowOnly : allowOnly;
|
const block = urlMatched ? !allowOnly : allowOnly;
|
||||||
return {block, done: false};
|
return {block, done: false, frameUrl};
|
||||||
}
|
}
|
||||||
|
|
||||||
async isTextMatch(request, reqUrl, frameTextMatch) {
|
async isTextMatch(request, reqUrl, frameTextMatch) {
|
||||||
|
@ -144,11 +166,13 @@ class BlockRules
|
||||||
return !!text.match(frameTextMatch);
|
return !!text.match(frameTextMatch);
|
||||||
|
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.log(e);
|
this.debugLog(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async recordBlockMsg(url) {
|
async recordBlockMsg(url, frameUrl) {
|
||||||
|
this.debugLog(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
|
||||||
|
|
||||||
if (!this.blockErrMsg || !this.blockPutUrl) {
|
if (!this.blockErrMsg || !this.blockPutUrl) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -162,7 +186,6 @@ class BlockRules
|
||||||
const body = this.blockErrMsg;
|
const body = this.blockErrMsg;
|
||||||
const putUrl = new URL(this.blockPutUrl);
|
const putUrl = new URL(this.blockPutUrl);
|
||||||
putUrl.searchParams.set("url", url);
|
putUrl.searchParams.set("url", url);
|
||||||
//console.log("put url", putUrl.href);
|
|
||||||
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
|
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue