browsertrix-crawler/util/blockrules.js
Ilya Kreymer f4c6b6a99f
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set

* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages

* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)

* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile

* screencasting: convert newContext to window instead of page by default, instead of just warning about it

* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2

* seeds: add trim() to seed URLs

* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically

* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles

* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25

* update CHANGES and README with new features

* bump version to 0.4.1
2021-07-22 14:24:51 -07:00

171 lines
4.3 KiB
JavaScript

const fetch = require("node-fetch");
// Rule types accepted by BlockRule: a "block" rule rejects what it matches,
// an "allowOnly" rule rejects what it does NOT match.
const RULE_TYPES = ["block", "allowOnly"];
// ===========================================================================
/**
 * A single URL block rule.
 *
 * Built either from a plain string (treated as a URL regex of type "block")
 * or from an object with optional `url`, `frameTextMatch`, `inFrameUrl`
 * regex sources and an optional `type` (defaults to "block").
 */
class BlockRule
{
  /**
   * @param {string|object} data - regex source string, or rule description object
   * @throws {Error} if the resulting type is not listed in RULE_TYPES
   */
  constructor(data) {
    if (typeof data !== "string") {
      // Object form: every pattern is optional, a missing one is stored as null.
      const toRegex = (source) => (source ? new RegExp(source) : null);

      this.url = toRegex(data.url);
      this.frameTextMatch = toRegex(data.frameTextMatch);
      this.inFrameUrl = toRegex(data.inFrameUrl);
      this.type = data.type || "block";
    } else {
      // Shorthand form: a bare string is a URL pattern to block.
      this.url = new RegExp(data);
      this.type = "block";
    }

    const isKnownType = RULE_TYPES.includes(this.type);
    if (!isKnownType) {
      throw new Error(`Rule "type" must be: ${RULE_TYPES.join(", ")}`);
    }
  }

  /**
   * Human-readable, multi-line summary of the rule (used for startup logging).
   * @returns {string}
   */
  toString() {
    const frameScope = this.inFrameUrl || "any";
    const hasTextMatch = Boolean(this.frameTextMatch);
    const resourceType = hasTextMatch ? "frame" : "any";
    const textMatchLine = hasTextMatch ? "Frame Text Regex: " + this.frameTextMatch : "";

    return `\
* Rule for URL Regex: ${this.url}
Type: ${this.type}
In Frame Regex: ${frameScope}
Resource Type: ${resourceType}
${textMatchLine}
`;
  }
}
// ===========================================================================
/**
 * Applies a list of BlockRule instances to intercepted page requests,
 * aborting requests that a rule says should be blocked and continuing
 * all others. Optionally records a message for each blocked URL by
 * issuing a PUT to `blockPutUrl`.
 */
class BlockRules
{
  /**
   * @param {Array<string|object>} blockRules - rule definitions passed to BlockRule
   *   (a missing/null value is treated as an empty list)
   * @param {string} [blockPutUrl] - endpoint that receives a PUT per blocked URL
   * @param {string} [blockErrMsg] - body sent with that PUT
   */
  constructor(blockRules, blockPutUrl, blockErrMsg) {
    this.rules = [];
    this.blockPutUrl = blockPutUrl;
    this.blockErrMsg = blockErrMsg;
    // URLs for which a block message was already sent, to avoid duplicates.
    this.putUrlSet = new Set();

    for (const ruleData of blockRules || []) {
      this.rules.push(new BlockRule(ruleData));
    }

    if (this.rules.length) {
      console.log("URL Block Rules:\n");
      for (const rule of this.rules) {
        console.log(rule.toString());
      }
    }
  }

  /**
   * Enables request interception on the page and routes every request
   * through handleRequest(). Does nothing when there are no rules.
   * @param {object} page - page object exposing setRequestInterception() and on()
   */
  async initPage(page) {
    if (!this.rules.length) {
      return;
    }

    await page.setRequestInterception(true);

    page.on("request", async (request) => {
      try {
        await this.handleRequest(request);
      } catch (e) {
        console.warn(e);
      }
    });
  }

  /**
   * Decides the fate of one intercepted request: abort it if any rule
   * blocks it, otherwise continue it. Non-http(s) requests always continue.
   *
   * continue()/abort() are awaited so that their failures surface to the
   * caller (and its try/catch) instead of becoming unhandled rejections.
   * @param {object} request - intercepted request
   */
  async handleRequest(request) {
    const url = request.url();

    if (!url.startsWith("http:") && !url.startsWith("https:")) {
      await request.continue();
      return;
    }

    for (const rule of this.rules) {
      const {done, block} = await this.shouldBlock(rule, request);

      if (block) {
        // not allowed, abort loading this response
        await request.abort();
        await this.recordBlockMsg(url);
        return;
      }

      if (done) {
        break;
      }
    }

    await request.continue();
  }

  /**
   * Evaluates a single rule against a request.
   * @param {BlockRule} rule
   * @param {object} request
   * @returns {Promise<{block: boolean, done: boolean}>} `block`: abort the request;
   *   `done`: stop evaluating further rules
   */
  async shouldBlock(rule, request) {
    const reqUrl = request.url();

    const {url, inFrameUrl, frameTextMatch} = rule;

    const type = rule.type || "block";
    const allowOnly = (type === "allowOnly");

    // frame() may be null (no associated frame); treat that as "unknown frame"
    // rather than throwing, which would leave the request unresolved.
    const frame = request.frame();
    const frameUrl = frame ? frame.url() : null;

    // ignore initial page
    if (frameUrl === "about:blank") {
      return {block: false, done: true};
    }

    // not a frame match (or frame unknown), skip rule
    if (inFrameUrl && (frameUrl === null || !frameUrl.match(inFrameUrl))) {
      return {block: false, done: false};
    }

    const urlMatched = (url && reqUrl.match(url));

    // frame text-based rule: only applies to navigation requests whose URL
    // matched; never block otherwise
    if (frameTextMatch) {
      if (!urlMatched || !request.isNavigationRequest()) {
        return {block: false, done: false};
      }

      const textMatched = await this.isTextMatch(request, reqUrl, frameTextMatch);
      const block = textMatched ? !allowOnly : allowOnly;
      return {block, done: true};
    }

    // for non frame text rule, simply match by URL
    const block = urlMatched ? !allowOnly : allowOnly;
    return {block, done: false};
  }

  /**
   * Fetches `reqUrl` and tests the response text against `frameTextMatch`.
   * @param {object} request - unused, kept for interface compatibility
   * @param {string} reqUrl
   * @param {RegExp} frameTextMatch
   * @returns {Promise<boolean>} true on match; false on no match or fetch failure
   */
  async isTextMatch(request, reqUrl, frameTextMatch) {
    try {
      const res = await fetch(reqUrl);
      const text = await res.text();

      return !!text.match(frameTextMatch);
    } catch (e) {
      console.log(e);
      // a failed fetch counts as "no match"
      return false;
    }
  }

  /**
   * Sends the configured block message for `url` via PUT, at most once per URL.
   * No-op unless both blockErrMsg and blockPutUrl are set.
   * @param {string} url - the blocked URL
   */
  async recordBlockMsg(url) {
    if (!this.blockErrMsg || !this.blockPutUrl) {
      return;
    }

    if (this.putUrlSet.has(url)) {
      return;
    }

    this.putUrlSet.add(url);

    const body = this.blockErrMsg;
    const putUrl = new URL(this.blockPutUrl);
    putUrl.searchParams.set("url", url);
    await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
  }
}
// Export the BlockRules class as a named CommonJS export.
module.exports.BlockRules = BlockRules;