// browsertrix-crawler/util/blockrules.js
import fs from "fs";
import { logger, errJSON } from "./logger.js";
// Valid values for a rule's "type" field
const RULE_TYPES = ["block", "allowOnly"];
// URL prefixes that are never blocked: internal pywb proxy scripts
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
// Outcome of a block decision: ALLOW is null; the string values name
// the kind of request that was blocked (used in log messages)
const BlockState = {
ALLOW: null,
BLOCK_PAGE_NAV: "page",
BLOCK_IFRAME_NAV: "iframe",
BLOCK_OTHER: "resource",
BLOCK_AD: "advertisement"
};
// ===========================================================================
class BlockRule
{
  /**
   * One block/allowOnly rule parsed from crawler config.
   * A bare string is shorthand for a "block" rule whose URL regex is that string;
   * otherwise the object may provide url, frameTextMatch, inFrameUrl and type.
   * Calls logger.fatal() (which aborts) on an unknown rule type.
   */
  constructor(data) {
    if (typeof data === "string") {
      // shorthand form: block by URL regex
      this.url = new RegExp(data);
      this.type = "block";
    } else {
      const { url, frameTextMatch, inFrameUrl, type } = data;
      this.url = url ? new RegExp(url) : null;
      this.frameTextMatch = frameTextMatch ? new RegExp(frameTextMatch) : null;
      this.inFrameUrl = inFrameUrl ? new RegExp(inFrameUrl) : null;
      this.type = type || "block";
    }

    if (!RULE_TYPES.includes(this.type)) {
      logger.fatal("Rule \"type\" must be: " + RULE_TYPES.join(", "));
    }
  }

  // Human-readable summary of the rule, logged at startup
  toString() {
    const inFrame = this.inFrameUrl ? this.inFrameUrl : "any";
    const resourceType = this.frameTextMatch ? "frame" : "any";
    const textLine = this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : "";
    return `\
* Rule for URL Regex: ${this.url}
Type: ${this.type}
In Frame Regex: ${inFrame}
Resource Type: ${resourceType}
${textLine}
`;
  }
}
// ===========================================================================
export class BlockRules
{
  /**
   * Applies user-configured block/allowOnly rules to intercepted requests.
   *
   * @param {Array<string|Object>} blockRules - raw rule configs, each parsed into a BlockRule
   * @param {string} blockPutUrl - if set, a notice is PUT here for each blocked URL
   * @param {string} blockErrMsg - HTML body sent with the block notice
   */
  constructor(blockRules, blockPutUrl, blockErrMsg) {
    this.rules = [];
    this.blockPutUrl = blockPutUrl;
    this.blockErrMsg = blockErrMsg;
    // URLs already reported via recordBlockMsg, to avoid duplicate PUTs
    this.blockedUrlSet = new Set();

    for (const ruleData of blockRules) {
      this.rules.push(new BlockRule(ruleData));
    }

    if (this.rules.length) {
      logger.debug("URL Block Rules:\n", {}, "blocking");
      for (const rule of this.rules) {
        logger.debug(rule.toString(), {}, "blocking");
      }
    }
  }

  /**
   * Install request interception on a page (at most once per page).
   * Returns true if interception was already added, undefined otherwise.
   */
  async initPage(page, cdp, browser) {
    if (!this.rules.length) {
      return;
    }

    if (page._btrix_interceptionAdded) {
      return true;
    }

    page._btrix_interceptionAdded = true;

    await browser.addIntercept(cdp, (route) => {
      const logDetails = {page: page.url()};
      try {
        // fire-and-forget: handleRequest catches its own async errors
        this.handleRequest(route, logDetails);
      } catch (e) {
        logger.warn("Error handling request", {...errJSON(e), ...logDetails}, "blocking");
      }
    });
  }

  // Decide allow/block for one intercepted request and resume or abort it.
  async handleRequest(route, logDetails) {
    const request = route.request();
    const url = request.url();

    let blockState;

    try {
      blockState = await this.shouldBlock(request, url, logDetails);

      if (blockState === BlockState.ALLOW) {
        await route.continue();
      } else {
        await route.abort("BlockedByClient");
      }
    } catch (e) {
      logger.debug(`Block: (${blockState}) Failed On: ${url}`, {...errJSON(e), ...logDetails}, "blocking");
    }
  }

  /**
   * Evaluate all rules against a request.
   * Returns BlockState.ALLOW (null) to let the request through, or the
   * BlockState describing why/where it was blocked.
   * Page navigations are never blocked here (use --exclude for that).
   */
  async shouldBlock(request, url, logDetails) {
    // only http(s) requests are subject to rules
    if (!url.startsWith("http:") && !url.startsWith("https:")) {
      return BlockState.ALLOW;
    }

    const isNavReq = request.isNavigationRequest();

    // NOTE(review): frame-level state (BLOCK_IFRAME_NAV, parent frame URL)
    // was disabled in the CDP-interception migration; nav requests are
    // treated as page navigations here
    const blockState = isNavReq ? BlockState.BLOCK_PAGE_NAV : BlockState.BLOCK_OTHER;

    // always allow special pywb proxy script
    for (const allowUrl of ALWAYS_ALLOW) {
      if (url.startsWith(allowUrl)) {
        return BlockState.ALLOW;
      }
    }

    for (const rule of this.rules) {
      const {done, block} = await this.ruleCheck(rule, request, url, isNavReq, logDetails);

      if (block) {
        // never block a full page navigation via block rules
        if (blockState === BlockState.BLOCK_PAGE_NAV) {
          logger.warn("Block rule match for page request ignored, set --exclude to block full pages", {url, ...logDetails}, "blocking");
          return BlockState.ALLOW;
        }
        logger.debug("URL Blocked in iframe", {url, ...logDetails}, "blocking");
        await this.recordBlockMsg(url);
        return blockState;
      }

      if (done) {
        break;
      }
    }

    return BlockState.ALLOW;
  }

  /**
   * Check one rule against a request.
   * Returns {block, done}: block = should block, done = stop evaluating
   * further rules (a frameTextMatch rule is terminal once its URL matches).
   */
  async ruleCheck(rule, request, reqUrl, isNavReq, logDetails) {
    const {url, frameTextMatch} = rule;

    // NOTE(review): inFrameUrl matching is currently disabled -- the frame
    // URL is not available via this interception path, so rules with
    // inFrameUrl are evaluated as if the frame always matched
    const frameUrl = "";

    const type = rule.type || "block";
    const allowOnly = (type === "allowOnly");

    const urlMatched = (url && reqUrl.match(url));

    // frame text-based match: only applies to nav requests, never block otherwise
    if (frameTextMatch) {
      if (!urlMatched || !isNavReq) {
        return {block: false, done: false};
      }
      const block = await this.isTextMatch(request, reqUrl, frameTextMatch, logDetails) ? !allowOnly : allowOnly;
      logger.debug("URL Conditional rule in iframe", {...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl}, "blocking");
      return {block, done: true};
    }

    // for non frame text rule, simply match by URL
    const block = urlMatched ? !allowOnly : allowOnly;
    return {block, done: false};
  }

  /**
   * Fetch the frame URL's body and test it against the rule's text regex.
   * Returns false (no match) if the fetch or read fails.
   */
  async isTextMatch(request, reqUrl, frameTextMatch, logDetails) {
    try {
      const res = await fetch(reqUrl);
      const text = await res.text();

      return !!text.match(frameTextMatch);
    } catch (e) {
      logger.debug("Error determining text match", {...errJSON(e), ...logDetails}, "blocking");
      return false;
    }
  }

  /**
   * Record a blocked URL (once per unique URL) by PUTting the configured
   * error message to blockPutUrl; no-op if either setting is missing.
   */
  async recordBlockMsg(url) {
    if (this.blockedUrlSet.has(url)) {
      return;
    }

    this.blockedUrlSet.add(url);

    if (!this.blockErrMsg || !this.blockPutUrl) {
      return;
    }

    const body = this.blockErrMsg;
    const putUrl = new URL(this.blockPutUrl);
    putUrl.searchParams.set("url", url);
    await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
  }
}
// ===========================================================================
export class AdBlockRules extends BlockRules
{
  /**
   * Blocks requests whose host appears in a JSON list of known ad hosts.
   *
   * @param {string} blockPutUrl - if set, a notice is PUT here for each blocked URL
   * @param {string} blockErrMsg - HTML body sent with the block notice
   * @param {string} adhostsFilePath - path to the ad-hosts JSON, resolved
   *   relative to this module
   */
  constructor(blockPutUrl, blockErrMsg, adhostsFilePath = "../ad-hosts.json") {
    super([], blockPutUrl, blockErrMsg);
    // array of hostnames considered ad servers
    this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url)));
  }

  /**
   * Install ad-blocking interception on a page (at most once per page).
   * Returns true if interception was already added, undefined otherwise.
   */
  async initPage(page, cdp, browser) {
    if (page._btrix_adInterceptionAdded) {
      return true;
    }

    page._btrix_adInterceptionAdded = true;

    await browser.addIntercept(cdp, (route) => {
      const logDetails = {page: page.url()};
      try {
        // fire-and-forget: handleRequest catches its own async errors
        this.handleRequest(route, logDetails);
      } catch (e) {
        logger.warn("Error handling request", {...errJSON(e), ...logDetails}, "blocking");
      }
    });
  }

  // True if the URL's host component is in the ad-hosts list.
  // Host is taken as the third "/"-separated fragment (scheme//host/...).
  isAdUrl(url) {
    const fragments = url.split("/");
    const domain = fragments.length > 2 ? fragments[2] : null;
    return this.adhosts.includes(domain);
  }

  // Block any request to a known ad host; allow everything else.
  async shouldBlock(request, url, logDetails) {
    if (this.isAdUrl(url)) {
      logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking");
      await this.recordBlockMsg(url);
      return BlockState.BLOCK_AD;
    }
    return BlockState.ALLOW;
  }
}