// browsertrix-crawler/util/blockrules.js

import fs from "fs";
import { logger, errJSON } from "./logger.js";
// Accepted values for a rule's "type" field.
const RULE_TYPES = Object.freeze(["block", "allowOnly"]);

// URL prefixes that are let through before any rule is evaluated.
const ALWAYS_ALLOW = Object.freeze(["https://pywb.proxy/", "http://pywb.proxy/"]);

// Outcome of a block decision. ALLOW is null; every other value is a
// short label naming what kind of request was blocked.
// Frozen so that an accidental write throws instead of silently changing
// how later requests are classified.
const BlockState = Object.freeze({
  ALLOW: null,
  BLOCK_PAGE_NAV: "page",
  BLOCK_IFRAME_NAV: "iframe",
  BLOCK_OTHER: "resource",
  BLOCK_AD: "advertisement",
});
// ===========================================================================
/**
 * A single parsed block rule.
 *
 * Fields (each regex field is a RegExp, or null when not configured):
 *  - url:            regex tested against the request URL
 *  - frameTextMatch: regex tested against the fetched text of the request URL
 *  - inFrameUrl:     regex tested against the URL of the containing frame
 *  - type:           one of RULE_TYPES
 */
class BlockRule
{
  /**
   * @param {string|Object} data - Either a regex source string (shorthand
   *   for a "block" rule on that URL pattern), or an object with optional
   *   `url`, `frameTextMatch`, `inFrameUrl` regex source strings and an
   *   optional `type` (defaults to "block").
   * @throws {SyntaxError} from the RegExp constructor if a pattern is invalid.
   */
  constructor(data) {
    // Give every instance the same set of fields, whichever input form is
    // used, so unset fields are always null rather than sometimes undefined.
    this.url = null;
    this.frameTextMatch = null;
    this.inFrameUrl = null;
    this.type = "block";

    if (typeof data === "string") {
      this.url = new RegExp(data);
    } else {
      if (data.url) {
        this.url = new RegExp(data.url);
      }
      if (data.frameTextMatch) {
        this.frameTextMatch = new RegExp(data.frameTextMatch);
      }
      if (data.inFrameUrl) {
        this.inFrameUrl = new RegExp(data.inFrameUrl);
      }
      this.type = data.type || "block";
    }

    if (!RULE_TYPES.includes(this.type)) {
      logger.fatal("Rule \"type\" must be: " + RULE_TYPES.join(", "));
    }
  }

  /**
   * Multi-line, human-readable description of the rule (used for debug
   * logging). The last line is empty when no frame text regex is set.
   *
   * @returns {string}
   */
  toString() {
    // Built from an array so the emitted text does not depend on how this
    // method happens to be indented in the source file.
    const lines = [
      `* Rule for URL Regex: ${this.url}`,
      `Type: ${this.type}`,
      `In Frame Regex: ${this.inFrameUrl ? this.inFrameUrl : "any"}`,
      `Resource Type: ${this.frameTextMatch ? "frame" : "any"}`,
      `${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}`,
    ];
    return lines.join("\n") + "\n";
  }
}
// ===========================================================================
/**
 * Applies a list of BlockRule objects to intercepted page requests,
 * continuing or aborting each request accordingly.
 */
export class BlockRules
{
  /**
   * @param {Array<string|Object>} blockRules - Raw rule definitions; each is
   *   passed to the BlockRule constructor.
   * @param {string} blockPutUrl - URL that receives a PUT with `blockErrMsg`
   *   the first time a given URL is blocked (optional).
   * @param {string} blockErrMsg - Body sent in that PUT (optional). Nothing is
   *   sent unless both this and `blockPutUrl` are set.
   */
  constructor(blockRules, blockPutUrl, blockErrMsg) {
    this.rules = [];
    this.blockPutUrl = blockPutUrl;
    this.blockErrMsg = blockErrMsg;
    // URLs already reported via recordBlockMsg(), so each is reported once
    this.blockedUrlSet = new Set();

    for (const ruleData of blockRules) {
      this.rules.push(new BlockRule(ruleData));
    }

    if (this.rules.length) {
      logger.debug("URL Block Rules:\n", {}, "blocking");
      for (const rule of this.rules) {
        logger.debug(rule.toString(), {}, "blocking");
      }
    }
  }

  /**
   * Register request interception on a page so that every request is run
   * through handleRequest().
   *
   * @param browser - Object providing `interceptRequest(page, callback)`.
   * @param page - Page whose requests are intercepted; `page.url()` is added
   *   to the log details of each request.
   */
  async initPage(browser, page) {
    const onRequest = async (request) => {
      const logDetails = {page: page.url()};
      try {
        await this.handleRequest(request, logDetails);
      } catch (e) {
        logger.warn("Error handling request", {...errJSON(e), ...logDetails}, "blocking");
      }
    };
    await browser.interceptRequest(page, onRequest);
  }

  /**
   * Decide on a single intercepted request and continue or abort it.
   * Failures are logged at debug level and not rethrown.
   *
   * @param request - Intercepted request (Puppeteer HTTPRequest-like).
   * @param {Object} logDetails - Extra fields merged into log entries.
   */
  async handleRequest(request, logDetails) {
    const url = request.url();
    let blockState;

    try {
      blockState = await this.shouldBlock(request, url, logDetails);

      // NOTE(review): the trailing 1 looks like the priority argument of
      // Puppeteer's cooperative intercept mode -- confirm against the
      // Puppeteer version in use.
      if (blockState === BlockState.ALLOW) {
        await request.continue({}, 1);
      } else {
        await request.abort("blockedbyclient", 1);
      }
    } catch (e) {
      logger.debug(`Block: (${blockState}) Failed On: ${url}`, {...errJSON(e), ...logDetails}, "blocking");
    }
  }

  /**
   * Evaluate all rules against a request.
   *
   * @param request - Intercepted request; `isNavigationRequest()` and
   *   `frame()` are read from it.
   * @param {string} url - The request URL.
   * @param {Object} logDetails - Extra fields merged into log entries.
   * @returns {Promise<null|string>} BlockState.ALLOW (null) to let the request
   *   through, otherwise the BlockState label describing what was blocked.
   */
  async shouldBlock(request, url, logDetails) {
    // only http(s) requests are subject to rules
    if (!url.startsWith("http:") && !url.startsWith("https:")) {
      return BlockState.ALLOW;
    }

    const isNavReq = request.isNavigationRequest();
    const frame = request.frame();

    let frameUrl = "";
    let blockState;

    if (isNavReq) {
      // frame() can be null (Puppeteer documents this for navigations to
      // error pages); treat that like a top-level navigation with no frame
      // URL instead of throwing.
      const parentFrame = frame ? frame.parentFrame() : null;
      if (parentFrame) {
        frameUrl = parentFrame.url();
        blockState = BlockState.BLOCK_IFRAME_NAV;
      } else {
        frameUrl = frame ? frame.url() : "";
        blockState = BlockState.BLOCK_PAGE_NAV;
      }
    } else {
      frameUrl = frame ? frame.url() : "";
      blockState = BlockState.BLOCK_OTHER;
    }

    // ignore initial page
    if (frameUrl === "about:blank") {
      return BlockState.ALLOW;
    }

    // always allow special pywb proxy script
    for (const allowUrl of ALWAYS_ALLOW) {
      if (url.startsWith(allowUrl)) {
        return BlockState.ALLOW;
      }
    }

    for (const rule of this.rules) {
      const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq, logDetails);

      if (block) {
        // top-level page navigations are never blocked here
        if (blockState === BlockState.BLOCK_PAGE_NAV) {
          logger.warn("Block rule match for page request ignored, set --exclude to block full pages", {url, ...logDetails}, "blocking");
          return BlockState.ALLOW;
        }
        logger.debug("URL Blocked in iframe", {url, frameUrl, ...logDetails}, "blocking");
        await this.recordBlockMsg(url);
        return blockState;
      }

      // a frame-text rule that applied is final: stop evaluating rules
      if (done) {
        break;
      }
    }

    return BlockState.ALLOW;
  }

  /**
   * Evaluate one rule against a request.
   *
   * @param {Object} rule - Rule with `url`, `inFrameUrl`, `frameTextMatch`
   *   (RegExp or falsy) and `type`.
   * @param request - Intercepted request (forwarded to isTextMatch()).
   * @param {string} reqUrl - The request URL.
   * @param {string} frameUrl - URL of the frame the request is attributed to.
   * @param {boolean} isNavReq - Whether this is a navigation request.
   * @param {Object} logDetails - Extra fields merged into log entries.
   * @returns {Promise<{block: boolean, done: boolean}>} `block`: this rule
   *   says to block; `done`: stop evaluating further rules.
   */
  async ruleCheck(rule, request, reqUrl, frameUrl, isNavReq, logDetails) {
    const {url, inFrameUrl, frameTextMatch} = rule;

    const type = rule.type || "block";
    const allowOnly = (type === "allowOnly");

    // not a frame match, skip rule
    if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
      return {block: false, done: false};
    }

    const urlMatched = (url && reqUrl.match(url));

    // frame text-based rule: only applies to nav requests whose URL matched,
    // never block otherwise
    if (frameTextMatch) {
      if (!urlMatched || !isNavReq) {
        return {block: false, done: false};
      }

      const textMatched = await this.isTextMatch(request, reqUrl, frameTextMatch, logDetails);
      const block = textMatched ? !allowOnly : allowOnly;
      // log the request URL under `url` (as every other log call here does),
      // not the rule's RegExp
      logger.debug("URL Conditional rule in iframe", {...logDetails, url: reqUrl, rule: block ? "BLOCKED" : "ALLOWED", frameUrl}, "blocking");
      return {block, done: true};
    }

    // for non frame text rule, simply match by URL
    const block = urlMatched ? !allowOnly : allowOnly;
    return {block, done: false};
  }

  /**
   * Fetch `reqUrl` and test the response text against `frameTextMatch`.
   *
   * @param request - Unused; kept for interface compatibility.
   * @param {string} reqUrl - URL to fetch.
   * @param {RegExp} frameTextMatch - Pattern to look for in the response text.
   * @param {Object} logDetails - Extra fields merged into log entries.
   * @returns {Promise<boolean>} true if the text matches; false if it does
   *   not or if the fetch failed (the failure is logged at debug level).
   */
  async isTextMatch(request, reqUrl, frameTextMatch, logDetails) {
    try {
      const res = await fetch(reqUrl);
      const text = await res.text();

      return !!text.match(frameTextMatch);
    } catch (e) {
      logger.debug("Error determining text match", {...errJSON(e), ...logDetails}, "blocking");
      return false;
    }
  }

  /**
   * Report a blocked URL, once per URL, by PUTting `blockErrMsg` to
   * `blockPutUrl` with the blocked URL in the `url` query parameter.
   * Does nothing beyond remembering the URL unless both are configured.
   *
   * Reporting is best-effort: a failure is logged and swallowed so that it
   * cannot prevent the caller from actually blocking the request.
   *
   * @param {string} url - The blocked URL.
   */
  async recordBlockMsg(url) {
    if (this.blockedUrlSet.has(url)) {
      return;
    }

    this.blockedUrlSet.add(url);

    if (!this.blockErrMsg || !this.blockPutUrl) {
      return;
    }

    try {
      const body = this.blockErrMsg;
      const putUrl = new URL(this.blockPutUrl);
      putUrl.searchParams.set("url", url);
      await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
    } catch (e) {
      logger.debug("Error recording block message", {url, ...errJSON(e)}, "blocking");
    }
  }
}
// ===========================================================================
/**
 * Block rules driven by a list of known ad hosts instead of regex rules.
 * A request is blocked when the host portion of its URL is in the list.
 */
export class AdBlockRules extends BlockRules
{
  /**
   * @param {string} blockPutUrl - Forwarded to BlockRules.
   * @param {string} blockErrMsg - Forwarded to BlockRules.
   * @param {string} adhostsFilePath - Path to a JSON file holding the ad host
   *   list, resolved relative to this module's URL. The file is read
   *   synchronously at construction time.
   */
  constructor(blockPutUrl, blockErrMsg, adhostsFilePath = "../ad-hosts.json") {
    super([], blockPutUrl, blockErrMsg);
    this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url)));
  }

  /**
   * @param {string} url - Request URL.
   * @returns {boolean} Whether the URL's host is in the ad host list.
   */
  isAdUrl(url) {
    // "scheme://host/path".split("/") -> [scheme:, "", host, ...]; the third
    // piece is compared as-is, so any ":port" suffix is part of the lookup.
    const fragments = url.split("/");
    const domain = fragments.length > 2 ? fragments[2] : null;
    return this.adhosts.includes(domain);
  }

  /**
   * Overrides the rule-based decision: block (and report) ad URLs, allow
   * everything else.
   *
   * @param request - Intercepted request (unused here).
   * @param {string} url - The request URL.
   * @param {Object} logDetails - Extra fields merged into log entries.
   * @returns {Promise<null|string>} BlockState.BLOCK_AD or BlockState.ALLOW.
   */
  async shouldBlock(request, url, logDetails) {
    if (this.isAdUrl(url)) {
      logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking");
      await this.recordBlockMsg(url);
      return BlockState.BLOCK_AD;
    }
    return BlockState.ALLOW;
  }
}