mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
BlockRules Fixes (0.4.3) (#75)
- blockrules fix: when checking an iframe nav request, match inFrameUrl against the parent iframe, not the current one
- blockrules: cleanup, always allow 'pywb.proxy' static files
- logging: when 'debug' logging is enabled, log urls blocked and conditional iframe checks from blockrules
- tests: add more complex test for blockrules
- update CHANGES and support info in README
- bump to 0.4.3
This commit is contained in:
parent
f0c5ca1035
commit
be1ee53c3e
6 changed files with 85 additions and 26 deletions
|
@ -1,5 +1,10 @@
|
|||
## CHANGES
|
||||
|
||||
v0.4.3
|
||||
- BlockRules Fixes: When considering the 'inFrameUrl' for a navigation request for an iframe, use the URL of the parent frame.
|
||||
- BlockRules Fixes: Always allow pywb proxy scripts.
|
||||
- Logging: Improved debug logging for block rules (log blocked requests and conditional iframe requests) when 'debug' is set in 'logging'.
|
||||
|
||||
v0.4.2
|
||||
- Compose/docs: Build latest image by default, update README to refer to latest image
|
||||
- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing
|
||||
|
|
|
@ -484,10 +484,9 @@ Then, loading the `http://localhost:8080/wr-net/https://webrecorder.net/` should
|
|||
Support
|
||||
-------
|
||||
|
||||
Initial support for development of Browsertrix Crawler, was provided by [Kiwix](https://kiwix.org/)
|
||||
Initial support for development of Browsertrix Crawler was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
|
||||
|
||||
Initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between
|
||||
Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
|
||||
Additional support for Browsertrix Crawler, including for the development of the 0.4.x version, has been provided by [Portico](https://www.portico.org/).
|
||||
|
||||
|
||||
License
|
||||
|
|
|
@ -329,7 +329,7 @@ class Crawler {
|
|||
await this.initPages();
|
||||
|
||||
if (this.params.blockRules && this.params.blockRules.length) {
|
||||
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage);
|
||||
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
|
||||
}
|
||||
|
||||
if (this.params.screencastPort) {
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "0.4.2",
|
||||
"version": "0.4.3",
|
||||
"main": "browsertrix-crawler",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||
|
|
|
@ -130,4 +130,36 @@ test("test block url in frame url", () => {
|
|||
});
|
||||
|
||||
|
||||
test("test block rules complex example, block external urls on main frame, but not on youtube", () => {
|
||||
const config = {
|
||||
"seeds": [
|
||||
"https://archiveweb.page/guide/troubleshooting/errors.html",
|
||||
],
|
||||
"depth": "0",
|
||||
"blockRules": [{
|
||||
"url": "(archiveweb.page|www.youtube.com)",
|
||||
"type": "allowOnly",
|
||||
"inFrameUrl": "archiveweb.page"
|
||||
}, {
|
||||
"url": "https://archiveweb.page/assets/js/vendor/lunr.min.js",
|
||||
"inFrameUrl": "archiveweb.page"
|
||||
}, {
|
||||
"url": "https://www.youtube.com/embed/",
|
||||
"type": "allowOnly",
|
||||
"frameTextMatch": "(\\\\\"channelId\\\\\":\\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\\")"
|
||||
}],
|
||||
|
||||
"combineWARC": true,
|
||||
|
||||
"logging": "stats,debug"
|
||||
};
|
||||
|
||||
|
||||
runCrawl("block-7", config);
|
||||
|
||||
expect(doesCDXContain("block-7", "\"https://archiveweb.page/assets/js/vendor/lunr.min.js\"")).toBe(false);
|
||||
expect(doesCDXContain("block-7", "\"video/mp4\"")).toBe(true);
|
||||
});
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,8 @@ const fetch = require("node-fetch");
|
|||
|
||||
const RULE_TYPES = ["block", "allowOnly"];
|
||||
|
||||
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
class BlockRule
|
||||
|
@ -37,10 +39,11 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
|
|||
// ===========================================================================
|
||||
class BlockRules
|
||||
{
|
||||
constructor(blockRules, blockPutUrl, blockErrMsg) {
|
||||
constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) {
|
||||
this.rules = [];
|
||||
this.blockPutUrl = blockPutUrl;
|
||||
this.blockErrMsg = blockErrMsg;
|
||||
this.debugLog = debugLog;
|
||||
this.putUrlSet = new Set();
|
||||
|
||||
for (const ruleData of blockRules) {
|
||||
|
@ -48,9 +51,9 @@ class BlockRules
|
|||
}
|
||||
|
||||
if (this.rules.length) {
|
||||
console.log("URL Block Rules:\n");
|
||||
this.debugLog("URL Block Rules:\n");
|
||||
for (const rule of this.rules) {
|
||||
console.log(rule.toString());
|
||||
this.debugLog(rule.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -79,15 +82,20 @@ class BlockRules
|
|||
return;
|
||||
}
|
||||
|
||||
// always allow special pywb proxy script
|
||||
for (const allowUrl of ALWAYS_ALLOW) {
|
||||
if (url.startsWith(allowUrl)) {
|
||||
request.continue();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for (const rule of this.rules) {
|
||||
const {done, block} = await this.shouldBlock(rule, request);
|
||||
const {done, block, frameUrl} = await this.shouldBlock(rule, request, url);
|
||||
|
||||
if (block) {
|
||||
//const frameUrl = request.frame().url();
|
||||
//console.log("Blocking/Aborting Request for: " + request.url());
|
||||
// not allowed, abort loading this response
|
||||
request.abort();
|
||||
await this.recordBlockMsg(request.url());
|
||||
await this.recordBlockMsg(url, frameUrl);
|
||||
return;
|
||||
}
|
||||
if (done) {
|
||||
|
@ -98,24 +106,37 @@ class BlockRules
|
|||
request.continue();
|
||||
}
|
||||
|
||||
async shouldBlock(rule, request) {
|
||||
const reqUrl = request.url();
|
||||
|
||||
async shouldBlock(rule, request, reqUrl) {
|
||||
const {url, inFrameUrl, frameTextMatch} = rule;
|
||||
|
||||
const type = rule.type || "block";
|
||||
const allowOnly = (type === "allowOnly");
|
||||
|
||||
const frameUrl = request.frame().url();
|
||||
const isNavReq = request.isNavigationRequest();
|
||||
|
||||
const frame = request.frame();
|
||||
|
||||
let frameUrl = null;
|
||||
|
||||
if (isNavReq) {
|
||||
const parentFrame = frame.parentFrame();
|
||||
if (parentFrame) {
|
||||
frameUrl = parentFrame.url();
|
||||
} else {
|
||||
frameUrl = frame.url();
|
||||
}
|
||||
} else {
|
||||
frameUrl = frame.url();
|
||||
}
|
||||
|
||||
// ignore initial page
|
||||
if (frameUrl === "about:blank") {
|
||||
return {block: false, done: true};
|
||||
return {block: false, done: true, frameUrl};
|
||||
}
|
||||
|
||||
// not a frame match, skip rule
|
||||
if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
|
||||
return {block: false, done: false};
|
||||
return {block: false, done: false, frameUrl};
|
||||
}
|
||||
|
||||
const urlMatched = (url && reqUrl.match(url));
|
||||
|
@ -123,17 +144,18 @@ class BlockRules
|
|||
// if frame text-based rule: if url matched and a frame request
|
||||
// frame text-based match: only applies to nav requests, never block otherwise
|
||||
if (frameTextMatch) {
|
||||
if (!urlMatched || !request.isNavigationRequest()) {
|
||||
return {block: false, done: false};
|
||||
if (!urlMatched || !isNavReq) {
|
||||
return {block: false, done: false, frameUrl};
|
||||
}
|
||||
|
||||
const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
|
||||
return {block, done: true};
|
||||
this.debugLog(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
|
||||
return {block, done: true, frameUrl};
|
||||
}
|
||||
|
||||
// for non frame text rule, simply match by URL
|
||||
const block = urlMatched ? !allowOnly : allowOnly;
|
||||
return {block, done: false};
|
||||
return {block, done: false, frameUrl};
|
||||
}
|
||||
|
||||
async isTextMatch(request, reqUrl, frameTextMatch) {
|
||||
|
@ -144,11 +166,13 @@ class BlockRules
|
|||
return !!text.match(frameTextMatch);
|
||||
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
this.debugLog(e);
|
||||
}
|
||||
}
|
||||
|
||||
async recordBlockMsg(url) {
|
||||
async recordBlockMsg(url, frameUrl) {
|
||||
this.debugLog(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
|
||||
|
||||
if (!this.blockErrMsg || !this.blockPutUrl) {
|
||||
return;
|
||||
}
|
||||
|
@ -162,7 +186,6 @@ class BlockRules
|
|||
const body = this.blockErrMsg;
|
||||
const putUrl = new URL(this.blockPutUrl);
|
||||
putUrl.searchParams.set("url", url);
|
||||
//console.log("put url", putUrl.href);
|
||||
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue