import fs from "fs";

import { logger, errJSON } from "./logger.js";

const RULE_TYPES = ["block", "allowOnly"];

const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];

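// Block-check outcome: ALLOW (null) lets a request through, while the string
// values record why a request was blocked (page nav, iframe nav, other
// resource, or ad).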
const BlockState = {
  ALLOW: null,
  BLOCK_PAGE_NAV: "page",
  BLOCK_IFRAME_NAV: "iframe",
  BLOCK_OTHER: "resource",
  BLOCK_AD: "advertisement"
};


// ===========================================================================
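// A BlockRule is built either from a bare regex string or from an object.
// Illustrative shapes only (field names match the constructor below):
//
//   "https?://example\\.com/ads/.*"
//
//   {
//     url: "example\\.com/embed",
//     inFrameUrl: "example\\.com/page",
//     frameTextMatch: "some-text-to-match",
//     type: "block"   // or "allowOnly"
//   }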
class BlockRule
{
  constructor(data) {
    if (typeof(data) === "string") {
      this.url = new RegExp(data);
      this.type = "block";
    } else {
      this.url = data.url ? new RegExp(data.url) : null;
      this.frameTextMatch = data.frameTextMatch ? new RegExp(data.frameTextMatch) : null;
      this.inFrameUrl = data.inFrameUrl ? new RegExp(data.inFrameUrl) : null;
      this.type = data.type || "block";
    }

    if (!RULE_TYPES.includes(this.type)) {
      logger.fatal("Rule \"type\" must be: " + RULE_TYPES.join(", "));
    }
  }

  toString() {
    return `\
* Rule for URL Regex: ${this.url}
    Type: ${this.type}
    In Frame Regex: ${this.inFrameUrl ? this.inFrameUrl : "any"}
    Resource Type: ${this.frameTextMatch ? "frame" : "any"}
${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
`;
  }
}


// ===========================================================================
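// Applies the configured block rules to intercepted requests. A minimal usage
// sketch, assuming crawler config fields along these lines (names are
// illustrative, not prescriptive):
//
//   const blockRules = new BlockRules(
//     params.blockRules || [],   // array of rule strings / objects
//     params.blockPutUrl,        // optional endpoint to PUT a block message to
//     params.blockMessage        // optional HTML body recorded for blocked URLs
//   );
//   await blockRules.initPage(page, cdp, browser);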
export class BlockRules
{
  constructor(blockRules, blockPutUrl, blockErrMsg) {
    this.rules = [];
    this.blockPutUrl = blockPutUrl;
    this.blockErrMsg = blockErrMsg;

    this.blockedUrlSet = new Set();

    for (const ruleData of blockRules) {
      this.rules.push(new BlockRule(ruleData));
    }

    if (this.rules.length) {
      logger.debug("URL Block Rules:\n", {}, "blocking");
      for (const rule of this.rules) {
        logger.debug(rule.toString(), {}, "blocking");
      }
    }
  }

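  // Registers a request interceptor for the page via CDP, routing every
  // request through handleRequest(). No-op if there are no rules, and guarded
  // so interception is only added once per page.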
  async initPage(page, cdp, browser) {
    if (!this.rules.length) {
      return;
    }

    if (page._btrix_interceptionAdded) {
      return true;
    }

    page._btrix_interceptionAdded = true;

    //await page.route("**/*", (route) => {
    await browser.addIntercept(cdp, (route) => {
      const logDetails = {page: page.url()};
      try {
        this.handleRequest(route, logDetails);
      } catch (e) {
        logger.warn("Error handling request", {...errJSON(e), ...logDetails}, "blocking");
      }
    });
  }

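  // Decides whether an intercepted request is continued or aborted, based on
  // shouldBlock(); errors are logged and never propagate back into the
  // interception callback.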
  async handleRequest(route, logDetails) {
    const request = route.request();
    const url = request.url();

    let blockState;

    try {
      blockState = await this.shouldBlock(request, url, logDetails);

      if (blockState === BlockState.ALLOW) {
        await route.continue();
      } else {
        await route.abort("BlockedByClient");
      }

    } catch (e) {
      logger.debug(`Block: (${blockState}) Failed On: ${url}`, {...errJSON(e), ...logDetails}, "blocking");
    }
  }

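  // Evaluates all rules against a request and returns a BlockState. Non-HTTP(S)
  // URLs and the pywb proxy scripts are always allowed, and matches on
  // top-level page navigations are ignored (full pages are excluded via
  // --exclude instead).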
  async shouldBlock(request, url, logDetails) {
    if (!url.startsWith("http:") && !url.startsWith("https:")) {
      return BlockState.ALLOW;
    }

    const isNavReq = request.isNavigationRequest();

    //const frame = request.frame();

    let frameUrl = "";
    let blockState;

    if (isNavReq) {
      //const parentFrame = frame.parentFrame();
      //if (parentFrame) {
      //  frameUrl = parentFrame.url();
      //  blockState = BlockState.BLOCK_IFRAME_NAV;
      //} else {
      //  frameUrl = frame.url();
      blockState = BlockState.BLOCK_PAGE_NAV;
      //}
    } else {
      //frameUrl = frame ? frame.url() : "";
      blockState = BlockState.BLOCK_OTHER;
    }

    // ignore initial page
    // if (frameUrl === "about:blank") {
    //   return BlockState.ALLOW;
    // }

    // always allow special pywb proxy script
    for (const allowUrl of ALWAYS_ALLOW) {
      if (url.startsWith(allowUrl)) {
        return BlockState.ALLOW;
      }
    }

    for (const rule of this.rules) {
      const {done, block} = await this.ruleCheck(rule, request, url, isNavReq, logDetails);

      if (block) {
        if (blockState === BlockState.BLOCK_PAGE_NAV) {
          logger.warn("Block rule match for page request ignored, set --exclude to block full pages", {url, ...logDetails}, "blocking");
          return BlockState.ALLOW;
        }
        logger.debug("URL Blocked in iframe", {url, ...logDetails}, "blocking");
        await this.recordBlockMsg(url);
        return blockState;
      }

      if (done) {
        break;
      }
    }

    return BlockState.ALLOW;
  }

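  // Checks a single rule against a request. Returns {block, done}: "block" is
  // the rule's verdict, and "done" stops further rule evaluation (used for
  // frameTextMatch rules, whose result is final).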
  async ruleCheck(rule, request, reqUrl, isNavReq, logDetails) {
    const {url, inFrameUrl, frameTextMatch} = rule;
    let frameUrl = "";

    const type = rule.type || "block";
    const allowOnly = (type === "allowOnly");

    // frame URL matching (inFrameUrl) is currently disabled: frameUrl is not
    // populated here, so the original "not a frame match, skip rule" check is
    // left commented out
    if (inFrameUrl) {
      //!frameUrl.match(inFrameUrl)
      //return {block: false, done: false};
    }

    const urlMatched = (url && reqUrl.match(url));

    // frame text-based match: only applies to nav requests with a URL match,
    // never block otherwise
    if (frameTextMatch) {
      if (!urlMatched || !isNavReq) {
        return {block: false, done: false};
      }

      const block = await this.isTextMatch(request, reqUrl, frameTextMatch, logDetails) ? !allowOnly : allowOnly;
      logger.debug("URL Conditional rule in iframe", {...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl}, "blocking");
      return {block, done: true};
    }

    // for a non frame-text rule, simply match by URL
    const block = urlMatched ? !allowOnly : allowOnly;
    return {block, done: false};
  }

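  // Fetches the request URL out-of-band and tests the response text against
  // the rule's frameTextMatch regex; a failed fetch is logged and treated as
  // no match.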
  async isTextMatch(request, reqUrl, frameTextMatch, logDetails) {
    try {
      const res = await fetch(reqUrl);
      const text = await res.text();

      return !!text.match(frameTextMatch);

    } catch (e) {
      logger.debug("Error determining text match", {...errJSON(e), ...logDetails}, "blocking");
    }
  }

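  // Records each blocked URL once; if both a block message and a blockPutUrl
  // endpoint are configured, PUTs the message to that endpoint with the
  // blocked URL as a query parameter.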
  async recordBlockMsg(url) {
    if (this.blockedUrlSet.has(url)) {
      return;
    }

    this.blockedUrlSet.add(url);

    if (!this.blockErrMsg || !this.blockPutUrl) {
      return;
    }

    const body = this.blockErrMsg;
    const putUrl = new URL(this.blockPutUrl);
    putUrl.searchParams.set("url", url);
    await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
  }
}


// ===========================================================================
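// Variant of BlockRules that blocks any request whose host appears in the
// bundled ad-hosts list, rather than using user-supplied rules. A minimal
// sketch of how it might be wired up (param names are illustrative):
//
//   const adBlockRules = new AdBlockRules(params.blockPutUrl, params.blockMessage);
//   await adBlockRules.initPage(page, cdp, browser);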
export class AdBlockRules extends BlockRules
{
  constructor(blockPutUrl, blockErrMsg, adhostsFilePath = "../ad-hosts.json") {
    super([], blockPutUrl, blockErrMsg);
    this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url)));
  }

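  // Same interception setup as BlockRules.initPage, but always registered,
  // since the ad-hosts list is used instead of configured rules.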
  async initPage(page, cdp, browser) {
    if (page._btrix_adInterceptionAdded) {
      return true;
    }

    page._btrix_adInterceptionAdded = true;

    //await page.route("**/*", (route) => {
    await browser.addIntercept(cdp, (route) => {
      const logDetails = {page: page.url()};
      try {
        this.handleRequest(route, logDetails);
      } catch (e) {
        logger.warn("Error handling request", {...errJSON(e), ...logDetails}, "blocking");
      }
    });
  }

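  // A URL counts as an ad if its host (the third "/"-delimited fragment)
  // appears in the ad-hosts list.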
  isAdUrl(url) {
    const fragments = url.split("/");
    const domain = fragments.length > 2 ? fragments[2] : null;
    return this.adhosts.includes(domain);
  }

  async shouldBlock(request, url, logDetails) {
    if (this.isAdUrl(url)) {
      logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking");
      await this.recordBlockMsg(url);
      return BlockState.BLOCK_AD;
    }
    return BlockState.ALLOW;
  }
}