From 5da379cb5f6989550af73e941027feb4287c4958 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 23 Feb 2023 18:50:22 -0800 Subject: [PATCH] Logging and Behavior Tweaks (#229) - Ensure page is included in all logging details - Update logging messages to be a single string, with variables added in the details - Always wait for all pending wait requests to finish (unless counter <0) - Don't set puppeteer-cluster timeout (prep for removing puppeeteer-cluster) - Add behaviorTimeout to running behaviors in crawler, in addition to in behaviors themselves. - Add logging for behavior start, finish and timeout - Move writeStats() logging to beginning of each page as well as at the end, to avoid confusion about pending pages. - For events from frames, use frameUrl along with current page - deps: bump browsertrix-behaviors to 0.4.2 - version: bump to 0.8.1 --- crawler.js | 121 ++++++++++++++++++++++++++++----------------- package.json | 4 +- util/blockrules.js | 43 ++++++++-------- util/browser.js | 18 +++---- util/state.js | 4 +- yarn.lock | 8 +-- 6 files changed, 116 insertions(+), 82 deletions(-) diff --git a/crawler.js b/crawler.js index 7ab2dfa4..928ce075 100644 --- a/crawler.js +++ b/crawler.js @@ -166,7 +166,7 @@ export class Crawler { this.logger.info(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state"); - this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout * 2, os.hostname()); + this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname()); } else { this.logger.info("Storing state in memory", {}, "state"); @@ -324,21 +324,35 @@ export class Crawler { } } - _behaviorLog({data, type}) { + _behaviorLog({data, type}, pageUrl) { let behaviorLine; + let message; + let details; + + if (typeof(data) === "string") { + message = data; + details = {}; + } else { + message = type === "info" ? "Behavior log" : "Behavior debug"; + details = typeof(data) === "object" ? data : {}; + } + + if (pageUrl) { + details.page = pageUrl; + } switch (type) { case "info": behaviorLine = JSON.stringify(data); if (behaviorLine != this._behaviorLastLine) { - this.logger.info("Behavior log", data, "behavior"); + this.logger.info(message, details, "behaviorScript"); this._behaviorLastLine = behaviorLine; } break; case "debug": default: - this.logger.debug("Behavior debug", data, "behavior"); + this.logger.debug(message, details, "behaviorScript"); } } @@ -349,7 +363,12 @@ export class Crawler { } async crawlPage(opts) { + await this.writeStats(); + const {page, data} = opts; + const {url} = data; + + const logDetails = {page: url}; if (!this.isInScope(data)) { this.logger.info("Page no longer in scope", data); @@ -358,7 +377,7 @@ export class Crawler { try { if (this.screencaster) { - await this.screencaster.screencastTarget(page.target(), data.url); + await this.screencaster.screencastTarget(page.target(), url); } if (this.emulateDevice) { @@ -372,7 +391,7 @@ export class Crawler { await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});"); if (this.params.behaviorOpts && !page.__bx_inited) { - await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata)); + await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, url)); await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`); page.__bx_inited = true; } @@ -384,10 +403,10 @@ export class Crawler { if (this.params.screenshot) { if (!page.isHTMLPage) { - this.logger.info("Skipping screenshots for non-HTML page"); + this.logger.info("Skipping screenshots for non-HTML page", logDetails); } const archiveDir = path.join(this.collDir, "archive"); - const screenshots = new Screenshots({page, url: data.url, directory: archiveDir}); + const screenshots = new Screenshots({page, url, directory: archiveDir}); if (this.params.screenshot.includes("view")) { await screenshots.take(); } @@ -410,45 +429,58 @@ export class Crawler { if (this.params.behaviorOpts) { if (!page.isHTMLPage) { - this.logger.info("Skipping behaviors for non-HTML page", data, "behavior"); + this.logger.info("Skipping behaviors for non-HTML page", logDetails, "behavior"); } else { - await Promise.allSettled( - page.frames(). - filter(frame => this.shouldRunBehavior(frame)). - map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();")) - ); - - // also wait for general net idle - await this.netIdle(page); + const behaviorTimeout = this.params.behaviorTimeout / 1000; + this.logger.info("Behaviors started", {behaviorTimeout, ...logDetails}, "behavior"); + const res = await Promise.race([ + this.sleep(behaviorTimeout), + Promise.allSettled( + page.frames(). + filter(frame => this.shouldRunBehavior(frame, logDetails)). + map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();", logDetails, "behavior")) + ) + ]); + if (res && res.length) { + this.logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior"); + } else { + this.logger.error("Behaviors timed out", logDetails, "behavior"); + } } } - this.logger.info("Page graph data for successfully crawled page", data, "pageGraph"); - - await this.writeStats(); + this.logger.info("Page finished", logDetails, "pageStatus"); await this.checkLimits(); await this.serializeConfig(); } catch (e) { - this.logger.error(`Error crawling page ${data.url}`, e); + this.logger.error("Page Errored", {...e, ...logDetails}, "pageStatus"); await this.markPageFailed(page); } } - async shouldRunBehavior(frame) { + async shouldRunBehavior(frame, logDetails) { if (!frame.parentFrame()) { return true; } - const url = frame.url(); + const frameUrl = frame.url(); - if (url === "about:blank") { - return false; + let res; + + if (frameUrl === "about:blank") { + res = false; + } else { + res = !(await this.adBlockRules.shouldBlock(null, frameUrl, logDetails)); } - return !!(await this.adBlockRules.shouldBlock(null, url)); + if (!res) { + this.logger.info("Skipping behavior for frame", {frameUrl, ...logDetails}, "behavior"); + } + + return res; } async createWARCInfo(filename) { @@ -591,7 +623,8 @@ export class Crawler { concurrency: this.params.newContext, maxConcurrency: this.params.workers, skipDuplicateUrls: false, - timeout: this.params.timeout * 2, + // effectively disable + timeout: 1e8, puppeteerOptions: this.puppeteerArgs, puppeteer, monitor: false @@ -645,6 +678,8 @@ export class Crawler { await this.pagesFH.close(); } + await this.writeStats(true); + // extra wait for all resources to land into WARCs await this.awaitPendingClear(); @@ -657,8 +692,6 @@ export class Crawler { await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd})); } - await this.writeStats(true); - if (this.params.generateWACZ && (!this.interrupted || this.finalExit || this.clearOnExit)) { await this.generateWACZ(); @@ -783,7 +816,7 @@ export class Crawler { "pendingPages": pendingList.map(x => JSON.stringify(x)) }; - this.logger.info("Crawl statistics", stats, "crawlState"); + this.logger.info("Crawl statistics", stats, "crawlStatus"); if (toFile && this.params.statsFilename) { try { @@ -797,6 +830,8 @@ export class Crawler { async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) { const {url, seedId, depth, extraHops = 0} = urlData; + const logDetails = {page: url}; + let isHTMLPage = true; if (!await this.isHTML(url)) { @@ -840,13 +875,14 @@ export class Crawler { try { await page.goto(url, gotoOpts); if (this.errorCount > 0) { - this.logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`); + this.logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`, logDetails); this.errorCount = 0; } } catch (e) { let msg = e.message || ""; if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) { - this.logger.error(`Load Error: ${url}: ${msg}`); + const mainMessage = e.name === "TimeoutError" ? "Page Load Timeout" : "Page Load Error"; + this.logger.error(mainMessage, {msg, ...logDetails}); this.errorCount++; } } @@ -859,11 +895,9 @@ export class Crawler { const seed = this.params.scopedSeeds[seedId]; - await this.checkCF(page); + await this.checkCF(page, logDetails); - await this.netIdle(page); - - await this.sleep(5); + await this.netIdle(page, logDetails); // skip extraction if at max depth if (seed.isAtMaxDepth(depth) || !selectorOptsList) { @@ -884,7 +918,7 @@ export class Crawler { } } - async netIdle(page) { + async netIdle(page, details) { if (!this.params.netIdleWait) { return; } @@ -895,7 +929,7 @@ export class Crawler { try { await page.waitForNetworkIdle({timeout: this.params.netIdleWait * 1000}); } catch (e) { - this.logger.info("note: waitForNetworkIdle timed out, ignoring"); + this.logger.info("waitForNetworkIdle timed out, ignoring", details); // ignore, continue } } @@ -956,14 +990,14 @@ export class Crawler { } } - async checkCF(page) { + async checkCF(page, logDetails) { try { while (await page.$("div.cf-browser-verification.cf-im-under-attack")) { - this.logger.info("Cloudflare Check Detected, waiting for reload..."); + this.logger.info("Cloudflare Check Detected, waiting for reload...", logDetails); await this.sleep(5.5); } } catch (e) { - this.logger.warn("Check CF failed, ignoring", e); + //this.logger.warn("Check CF failed, ignoring"); } } @@ -1090,10 +1124,7 @@ export class Crawler { const redis = await initRedis("redis://localhost/0"); - // wait for pending requests upto 120 secs, unless canceling - const MAX_WAIT = 120; - - for (let i = 0; i < MAX_WAIT && !this.interrupted; i++) { + while (!this.interrupted) { try { const count = Number(await redis.get(`pywb:${this.params.collection}:pending`) || 0); if (count <= 0) { diff --git a/package.json b/package.json index 9893745a..388e0e9a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "0.8.0-beta.2", + "version": "0.8.1", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", @@ -13,7 +13,7 @@ "dependencies": { "@novnc/novnc": "1.4.0-beta", "abort-controller": "^3.0.0", - "browsertrix-behaviors": "^0.4.1", + "browsertrix-behaviors": "^0.4.2", "get-folder-size": "^4.0.0", "ioredis": "^4.27.1", "js-yaml": "^4.1.0", diff --git a/util/blockrules.js b/util/blockrules.js index 29aa2116..6085216c 100644 --- a/util/blockrules.js +++ b/util/blockrules.js @@ -64,9 +64,9 @@ export class BlockRules } if (this.rules.length) { - this.logger.debug("URL Block Rules:\n"); + this.logger.debug("URL Block Rules:\n", {}, "blocking"); for (const rule of this.rules) { - this.logger.debug(rule.toString()); + this.logger.debug(rule.toString(), {}, "blocking"); } } } @@ -85,21 +85,22 @@ export class BlockRules await page.setRequestInterception(true); page.on("request", async (request) => { + const logDetails = {page: page.url()}; try { - await this.handleRequest(request); + await this.handleRequest(request, logDetails); } catch (e) { - this.logger.warn("Error handling request", e); + this.logger.warn("Error handling request", {...e, ...logDetails}, "blocking"); } }); } - async handleRequest(request) { + async handleRequest(request, logDetails) { const url = request.url(); let blockState; try { - blockState = await this.shouldBlock(request, url); + blockState = await this.shouldBlock(request, url, logDetails); if (blockState === BlockState.ALLOW) { await request.continue(); @@ -108,11 +109,11 @@ export class BlockRules } } catch (e) { - this.logger.debug(`Block: (${blockState}) Failed On: ${url}`, e); + this.logger.debug(`Block: (${blockState}) Failed On: ${url}`, {...e, ...logDetails}, "blocking"); } } - async shouldBlock(request, url) { + async shouldBlock(request, url, logDetails) { if (!url.startsWith("http:") && !url.startsWith("https:")) { return BlockState.ALLOW; } @@ -151,14 +152,14 @@ export class BlockRules } for (const rule of this.rules) { - const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq); + const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq, logDetails); if (block) { if (blockState === BlockState.BLOCK_PAGE_NAV) { - this.logger.warn(`Warning: Block rule match for page request "${url}" ignored, set --exclude to block full pages`); + this.logger.warn("Block rule match for page request ignored, set --exclude to block full pages", {url, ...logDetails}, "blocking"); return BlockState.ALLOW; } - this.logger.debug(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`); + this.logger.debug("URL Blocked in iframe", {url, frameUrl, ...logDetails}, "blocking"); await this.recordBlockMsg(url); return blockState; } @@ -170,7 +171,7 @@ export class BlockRules return BlockState.ALLOW; } - async ruleCheck(rule, request, reqUrl, frameUrl, isNavReq) { + async ruleCheck(rule, request, reqUrl, frameUrl, isNavReq, logDetails) { const {url, inFrameUrl, frameTextMatch} = rule; const type = rule.type || "block"; @@ -190,8 +191,8 @@ export class BlockRules return {block: false, done: false}; } - const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly; - this.logger.debug(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`); + const block = await this.isTextMatch(request, reqUrl, frameTextMatch, logDetails) ? !allowOnly : allowOnly; + this.logger.debug("URL Conditional rule in iframe", {...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl}, "blocking"); return {block, done: true}; } @@ -200,7 +201,7 @@ export class BlockRules return {block, done: false}; } - async isTextMatch(request, reqUrl, frameTextMatch) { + async isTextMatch(request, reqUrl, frameTextMatch, logDetails) { try { const res = await fetch(reqUrl); const text = await res.text(); @@ -208,7 +209,7 @@ export class BlockRules return !!text.match(frameTextMatch); } catch (e) { - this.logger.debug("Error determining text match", e); + this.logger.debug("Error determining text match", {...e, ...logDetails}, "blocking"); } } @@ -249,19 +250,21 @@ export class AdBlockRules extends BlockRules await page.setRequestInterception(true); page.on("request", async (request) => { + const logDetails = {page: page.url()}; + try { - await this.handleRequest(request); + await this.handleRequest(request, logDetails); } catch (e) { - this.logger.warn("Error handling request", e); + this.logger.warn("Error handling request", {...e, ...logDetails}, "blocking"); } }); } - async shouldBlock(request, url) { + async shouldBlock(request, url, logDetails) { const fragments = url.split("/"); const domain = fragments.length > 2 ? fragments[2] : null; if (this.adhosts.includes(domain)) { - this.logger.debug(`URL blocked for being an ad: ${url}`); + this.logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking"); await this.recordBlockMsg(url); return BlockState.BLOCK_AD; } diff --git a/util/browser.js b/util/browser.js index 1c2287ba..fbfa7e45 100644 --- a/util/browser.js +++ b/util/browser.js @@ -143,11 +143,12 @@ export function chromeArgs (proxy, userAgent=null, extraArgs=[]) { } -export async function evaluateWithCLI(frame, funcString, name = "behaviors") { +export async function evaluateWithCLI(frame, funcString, logData, contextName) { const context = await frame.executionContext(); - const url = frame.url(); - logger.info(`Running ${name}...`, {url}); + let details = {frameUrl: frame.url(), ...logData}; + + logger.info("Run Script Started", details, contextName); // from puppeteer _evaluateInternal() but with includeCommandLineAPI: true const contextId = context._contextId; @@ -164,13 +165,12 @@ export async function evaluateWithCLI(frame, funcString, name = "behaviors") { }); if (exceptionDetails) { - const details = exceptionDetails.stackTrace || {}; - details.url = url; - logger.error( - `Run ${name} failed: ${exceptionDetails.text}`, details - ); + if (exceptionDetails.stackTrace) { + details = {...exceptionDetails.stackTrace, text: exceptionDetails.text, ...details}; + } + logger.error("Run Script Failed", details, contextName); } else { - logger.info(`Run ${name} finished`, {url}); + logger.info("Run Script Finished", details, contextName); } return result.value; diff --git a/util/state.js b/util/state.js index 05baee9e..44890a53 100644 --- a/util/state.js +++ b/util/state.js @@ -326,11 +326,11 @@ return 0; data = JSON.parse(json); } catch(e) { logger.error("Invalid queued json", json); - return null; + return undefined; } if (!data) { - return null; + return undefined; } const url = data.url; diff --git a/yarn.lock b/yarn.lock index b6f49f9f..019034e0 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1148,10 +1148,10 @@ browserslist@^4.21.3: node-releases "^2.0.6" update-browserslist-db "^1.0.9" -browsertrix-behaviors@^0.4.1: - version "0.4.1" - resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.4.1.tgz#dfa5beb66348df28ccbaad4ba66fe49ca0dde947" - integrity sha512-zG6eNWqT9z+WvdUSSWg6de8NtYnGs7MHpzezKUgMJg+ze1+xtBgc6jkLT1d64eXC0cZQgwW+WV/e0yLAi1PyPA== +browsertrix-behaviors@^0.4.2: + version "0.4.2" + resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.4.2.tgz#830f4e37ddebf10dd923237dfd75c734d5d211e9" + integrity sha512-5w6kPL3NB/BkmEGxWt3NT3iddAaSzMR1TtDPS7b66fM9kkhpCjCv/R/zR951jWDIeV3flJFBOy09uI5o8asPqg== bser@2.1.1: version "2.1.1"