From 416a62fb7a52c7d7faa3db4e317c5db817b2d868 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 31 Aug 2023 11:47:49 -0700
Subject: [PATCH] improve logging / healthcheck responsiveness:

- avoid double logging of exceptions
- ensure page is included in all errors
- treat some timeouts as warnings
- remove unused responseHeaders call in browser
- fix screenshot err logging

healthcheck:
- interrupt crawl directly after healthcheck fails
---
 crawler.js          | 168 +++++++++++++++++++++++++-------------------
 util/browser.js     |   4 --
 util/healthcheck.js |  13 +++-
 util/screenshots.js |   8 +--
 util/state.js       |  33 +++++----
 util/timing.js      |   5 +-
 util/worker.js      |   6 +-
 7 files changed, 140 insertions(+), 97 deletions(-)

diff --git a/crawler.js b/crawler.js
index 493faa14..431757df 100644
--- a/crawler.js
+++ b/crawler.js
@@ -546,6 +546,10 @@ self.__bx_behaviors.selectMainBehavior();
       if (this.healthChecker) {
         this.healthChecker.incError();
       }
+      this.interrupted = this.healthChecker && this.healthChecker.isFailing();
+      if (this.interrupted) {
+        logger.warn("Interrupting due to too many failures, will restart");
+      }
     }
 
     await this.serializeConfig();
@@ -992,7 +996,7 @@ self.__bx_behaviors.selectMainBehavior();
         logger.debug(stdout.join("\n"));
       }
       if (stderr.length && this.params.logging.includes("debug")) {
-        logger.error(stderr.join("\n"));
+        logger.debug(stderr.join("\n"));
       }
       resolve(code);
     });
@@ -1046,20 +1050,24 @@ self.__bx_behaviors.selectMainBehavior();
 
     const failCrawlOnError = ((depth === 0) && this.params.failOnFailedSeed);
 
-    let isHTMLPage = await timedRun(
+    data.isHTMLPage = await timedRun(
       this.isHTML(url),
       FETCH_TIMEOUT_SECS,
       "HEAD request to determine if URL is HTML page timed out",
-      logDetails
+      logDetails,
+      "fetch",
+      true
     );
 
-    if (!isHTMLPage) {
+    if (!data.isHTMLPage) {
       try {
         const captureResult = await timedRun(
           this.directFetchCapture(url),
           FETCH_TIMEOUT_SECS,
           "Direct fetch capture attempt timed out",
-          logDetails
+          logDetails,
+          "fetch",
+          true
         );
         if (captureResult) {
           logger.info("Direct fetch successful", {url, ...logDetails}, "fetch");
           return;
         }
@@ -1070,73 +1078,11 @@ self.__bx_behaviors.selectMainBehavior();
       }
     }
 
-    let ignoreAbort = false;
-
-    // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
-    // if so, don't report as an error
-    page.once("requestfailed", (req) => {
-      ignoreAbort = shouldIgnoreAbort(req);
-    });
-
-    if (isHTMLPage) {
-      page.once("domcontentloaded", () => {
-        data.loadState = LoadState.CONTENT_LOADED;
-      });
-    }
-
-    const gotoOpts = isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
-
-    logger.info("Awaiting page load", logDetails);
-
-    try {
-      const resp = await page.goto(url, gotoOpts);
-
-      // Handle 4xx or 5xx response as a page load error
-      const statusCode = resp.status();
-      if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
-        if (failCrawlOnError) {
-          logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
-        } else {
-          logger.error("Page Load Error, skipping page", {statusCode, ...logDetails});
-          throw new Error(`Page ${url} returned status code ${statusCode}`);
-        }
-      }
-
-      const contentType = await this.browser.responseHeader(resp, "content-type");
-
-      isHTMLPage = this.isHTMLContentType(contentType);
-
-    } catch (e) {
-      let msg = e.message || "";
-      if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
-        if (e.name === "TimeoutError") {
-          if (data.loadState !== LoadState.CONTENT_LOADED) {
-            if (failCrawlOnError) {
-              logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
-            } else {
-              logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
-              throw e;
-            }
-          } else {
-            logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
-            data.skipBehaviors = true;
-          }
-        } else {
-          if (failCrawlOnError) {
-            logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
-          } else {
-            logger.error("Page Load Error, skipping page", {msg, ...logDetails});
-            throw e;
-          }
-        }
-      }
-    }
+    await this.doGotoPage(page, url, data, logDetails, failCrawlOnError);
 
     data.loadState = LoadState.FULL_PAGE_LOADED;
 
-    data.isHTMLPage = isHTMLPage;
-
-    if (isHTMLPage) {
+    if (data.isHTMLPage) {
       let frames = await page.frames();
       frames = await Promise.allSettled(frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)));
 
@@ -1147,7 +1093,7 @@ self.__bx_behaviors.selectMainBehavior();
       data.filteredFrames = [];
     }
 
-    if (!isHTMLPage) {
+    if (!data.isHTMLPage) {
       logger.debug("Skipping link extraction for non-HTML page", logDetails);
       return;
     }
@@ -1172,6 +1118,82 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
+  async doGotoPage(page, url, data, logDetails, failCrawlOnError) {
+    let ignoreAbort = false;
+
+    // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
+    // if so, don't report as an error
+    page.once("requestfailed", (req) => {
+      ignoreAbort = shouldIgnoreAbort(req);
+    });
+
+    if (data.isHTMLPage) {
+      page.once("domcontentloaded", () => {
+        data.loadState = LoadState.CONTENT_LOADED;
+      });
+    }
+
+    const gotoOpts = data.isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
+
+    logger.info("Awaiting page load", logDetails);
+
+    let mainPageResp = null;
+
+    try {
+      mainPageResp = await page.goto(url, gotoOpts);
+
+    } catch (e) {
+      let msg = e.message || "";
+      if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
+        if (e.name === "TimeoutError") {
+          if (data.loadState !== LoadState.CONTENT_LOADED) {
+            if (failCrawlOnError) {
+              logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
+            } else {
+              logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
+              e.message = "logged";
+              throw e;
+            }
+          } else {
+            logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
+            data.skipBehaviors = true;
+          }
+        } else {
+          if (failCrawlOnError) {
+            logger.fatal("Seed Page Load Error, failing crawl", {msg, ...logDetails});
+          } else {
+            logger.error("Page Load Error, skipping page", {msg, ...logDetails});
+            e.message = "logged";
+            throw e;
+          }
+        }
+      }
+    }
+
+    if (!mainPageResp) {
+      return;
+    }
+
+    // Handle 4xx or 5xx response as a page load error
+    const statusCode = mainPageResp.status();
+
+    if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
+      if (failCrawlOnError) {
+        logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
+      } else {
+        logger.error("Invalid status code, skipping page", {statusCode, ...logDetails});
+        throw new Error("logged");
+      }
+    }
+
+    // attempt to determine content-type more accurately, ignore if failed
+    try {
+      data.isHTMLPage = this.isHTMLContentType(mainPageResp.headers()["content-type"]);
+    } catch (e) {
+      // ignore
+    }
+  }
+
   async netIdle(page, details) {
     if (!this.params.netIdleWait) {
       return;
@@ -1267,7 +1289,11 @@ self.__bx_behaviors.selectMainBehavior();
 
     while (await timedRun(
       page.$("div.cf-browser-verification.cf-im-under-attack"),
-      PAGE_OP_TIMEOUT_SECS
+      PAGE_OP_TIMEOUT_SECS,
+      "Cloudflare check timed out",
+      logDetails,
+      "general",
+      true
     )) {
       logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails);
       await sleep(5.5);
diff --git a/util/browser.js b/util/browser.js
index b5f628e8..806c0f17 100644
--- a/util/browser.js
+++ b/util/browser.js
@@ -280,10 +280,6 @@ export class Browser extends BaseBrowser
     return {page, cdp};
   }
 
-  async responseHeader(resp, header) {
-    return await resp.headers()[header];
-  }
-
   async evaluateWithCLI(_, frame, cdp, funcString, logData, contextName) {
     const context = await frame.executionContext();
     cdp = context._client;
diff --git a/util/healthcheck.js b/util/healthcheck.js
index 36506ffc..f386f031 100644
--- a/util/healthcheck.js
+++ b/util/healthcheck.js
@@ -10,17 +10,22 @@ export class HealthChecker
     this.port = port;
     this.errorCount = 0;
     this.errorThreshold = errorThreshold;
+    this.resetCount = 0;
 
     this.healthServer = http.createServer((...args) => this.healthCheck(...args));
     logger.info(`Healthcheck server started on ${port}`, {}, "healthcheck");
     this.healthServer.listen(port);
   }
 
+  isFailing() {
+    return this.errorCount >= this.errorThreshold;
+  }
+
   async healthCheck(req, res) {
     const pathname = url.parse(req.url).pathname;
     switch (pathname) {
     case "/healthz":
-      if (this.errorCount < this.errorThreshold) {
+      if (!this.isFailing()) {
         logger.debug(`health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`, {}, "healthcheck");
         res.writeHead(200);
         res.end();
@@ -34,9 +39,13 @@ export class HealthChecker
   }
 
   resetErrors() {
-    if (this.errorCount > 0) {
+    this.resetCount++;
+    if (this.errorCount > 0 && this.resetCount >= 2) {
       logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`, {}, "healthcheck");
       this.errorCount = 0;
+      this.resetCount = 0;
+    } else if (!this.errorCount) {
+      this.resetCount = 0;
     }
   }
 
diff --git a/util/screenshots.js b/util/screenshots.js
index 26f64a09..44f24ec6 100644
--- a/util/screenshots.js
+++ b/util/screenshots.js
@@ -3,7 +3,7 @@ import path from "path";
 
 import * as warcio from "warcio";
 import sharp from "sharp";
 
-import { logger } from "./logger.js";
+import { logger, errJSON } from "./logger.js";
 
 
 // ============================================================================
@@ -47,7 +47,7 @@ export class Screenshots {
       await this.writeBufferToWARC(screenshotBuffer, screenshotType, options.type);
       logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
     } catch (e) {
-      logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
+      logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
     }
   }
 
@@ -56,8 +56,8 @@ export class Screenshots {
   }
 
   async takeThumbnail() {
+    const screenshotType = "thumbnail";
     try {
-      const screenshotType = "thumbnail";
       await this.browser.setViewport(this.page, {width: 1920, height: 1080});
       const options = screenshotTypes[screenshotType];
       const screenshotBuffer = await this.page.screenshot(options);
@@ -68,7 +68,7 @@ export class Screenshots {
       await this.writeBufferToWARC(thumbnailBuffer, screenshotType, options.type);
       logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
     } catch (e) {
-      logger.error(`Taking screenshot (type: thumbnail) failed for ${this.url}`, e.message);
+      logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
     }
   }
 
diff --git a/util/state.js b/util/state.js
index 0cfe4547..38960a0d 100644
--- a/util/state.js
+++ b/util/state.js
@@ -134,7 +134,7 @@ end
   });
 
   redis.defineCommand("requeue", {
-    numberOfKeys: 3,
+    numberOfKeys: 4,
     lua: `
 local res = redis.call('get', KEYS[3]);
 if not res then
@@ -148,6 +148,9 @@ if not res then
     redis.call('zadd', KEYS[2], 0, json);
     return 1;
   else
+    data['failed'] = '1';
+    json = cjson.encode(data);
+    redis.call('lpush', KEYS[4], json);
     return 2;
   end
 end
@@ -179,7 +182,8 @@ return 0;
   }
 
   async markFailed(url) {
-    await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
+    //await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
+    await this.requeueOrFail(url);
 
     return await this.redis.incr(this.dkey);
   }
@@ -416,19 +420,24 @@ return 0;
     const pendingUrls = await this.redis.hkeys(this.pkey);
 
     for (const url of pendingUrls) {
-      const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url, this.maxRetryPending);
-      switch (res) {
-      case 1:
-        logger.info(`Requeued: ${url}`);
-        break;
-
-      case 2:
-        logger.info(`Not requeuing anymore: ${url}`);
-        break;
-      }
+      await this.requeueOrFail(url);
     }
   }
 
+  async requeueOrFail(url) {
+    const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, this.fkey, url, this.maxRetryPending);
+    switch (res) {
+    case 1:
+      logger.info(`Requeued: ${url}`);
+      break;
+
+    case 2:
+      logger.info(`Not requeuing anymore: ${url}`);
+      break;
+    }
+    return res === 1;
+  }
+
   async queueSize() {
     this._lastSize = await this.redis.zcard(this.qkey);
     return this._lastSize;
diff --git a/util/timing.js b/util/timing.js
index 926d0402..b0f63c2d 100644
--- a/util/timing.js
+++ b/util/timing.js
@@ -4,7 +4,7 @@
 export function sleep(seconds) {
   return new Promise(resolve => setTimeout(resolve, seconds * 1000));
 }
 
-export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general") {
+export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general", isWarn=false) {
   // return Promise return value or log error if timeout is reached first
   const timeout = seconds * 1000;
 
@@ -17,7 +17,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai
   return Promise.race([promise, rejectPromiseOnTimeout(timeout)])
     .catch((err) => {
       if (err == "timeout reached") {
-        logger.error(message, {"seconds": seconds, ...logDetails}, context);
+        const logFunc = isWarn ? logger.warn : logger.error;
+        logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context);
       } else {
         //logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
         throw err;
diff --git a/util/worker.js b/util/worker.js
index 2f61b754..6117956e 100644
--- a/util/worker.js
+++ b/util/worker.js
@@ -167,14 +167,16 @@ export class PageWorker
             this.crawler.crawlPage(opts),
             this.maxPageTime,
             "Page Worker Timeout",
-            {workerid},
+            this.logDetails,
             "worker"
           ),
           this.crashBreak
         ]);
 
       } catch (e) {
-        logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
+        if (e.message !== "logged") {
+          logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
+        }
       } finally {
         await this.crawler.pageFinished(data);
       }
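
Usage notes: the sketches below illustrate the patched APIs; URLs, stub names, and threshold values are stand-ins, not taken from the crawler.

The new timedRun signature adds isWarn as a sixth parameter, so callers can downgrade expected timeouts (the HEAD probe, direct fetch attempt, and Cloudflare check) from errors to warnings, keeping them out of the healthcheck error count. A minimal sketch, assuming a standalone ESM script and an arbitrary slow promise:

    import { timedRun } from "./util/timing.js";

    // stand-in for a HEAD probe or page operation that may legitimately stall
    const slowProbe = new Promise(resolve => setTimeout(() => resolve(true), 60000));

    // with isWarn=true, hitting the 5s timeout logs a warning in the "fetch"
    // context instead of an error
    const result = await timedRun(
      slowProbe,
      5,                                // seconds before timing out
      "Probe timed out",                // message logged on timeout
      { page: "https://example.com/" }, // extra log details
      "fetch",                          // log context
      true                              // isWarn: warn instead of error
    );
    // on timeout the rejection is swallowed after logging, so result is undefined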
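
The double-logging fix relies on a sentinel value: the page-level handler logs the failure once (with the page attached), rewrites e.message to "logged", and rethrows; the worker's catch-all then skips anything already reported. A self-contained sketch of the pattern; the logger stub and function names are illustrative, not the crawler's actual modules:

    const logger = {
      error: (msg, details = {}) => console.error(msg, JSON.stringify(details)),
    };

    async function loadPage(url) {
      try {
        throw new Error("net::ERR_CONNECTION_REFUSED"); // simulated load failure
      } catch (e) {
        // log once, with the page included, then mark the error as handled
        logger.error("Page Load Error, skipping page", { msg: e.message, page: url });
        e.message = "logged";
        throw e;
      }
    }

    async function workerIteration(url) {
      try {
        await loadPage(url);
      } catch (e) {
        // catch-all: only report errors not already logged downstream
        if (e.message !== "logged") {
          logger.error("Worker Exception", { msg: e.message, page: url });
        }
      }
    }

    await workerIteration("https://example.com/");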
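
Interrupting as soon as isFailing() trips means the crawler restarts itself at the same threshold at which /healthz would begin returning 503, rather than waiting for an orchestrator to kill it mid-page. A sketch of the flow with a simplified stand-in for HealthChecker (the threshold of 3 is arbitrary):

    class MiniHealthChecker {
      constructor(errorThreshold = 3) {
        this.errorCount = 0;
        this.errorThreshold = errorThreshold;
      }
      incError() { this.errorCount++; }
      isFailing() { return this.errorCount >= this.errorThreshold; }
    }

    const healthChecker = new MiniHealthChecker(3);
    let interrupted = false;

    function onPageFailed() {
      healthChecker.incError();
      // same guard as the patch: healthChecker may be absent if no port was configured
      interrupted = healthChecker && healthChecker.isFailing();
      if (interrupted) {
        console.warn("Interrupting due to too many failures, will restart");
        // the crawl loop sees `interrupted`, checkpoints state, and exits
      }
    }

    onPageFailed();
    onPageFailed();
    onPageFailed(); // third failure trips the threshold and interrupts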
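
With numberOfKeys raised to 4, the requeue script receives the failed-list key as KEYS[4] and records a URL there itself once maxRetryPending is exhausted, so markFailed and the pending-page sweep share one code path. A sketch of the call site and how its return codes read; the state object is assumed to carry the same key names the patch uses:

    // KEYS: pending hash, queue zset, per-URL pending lock, failed list
    async function requeueOrFail(state, url) {
      const res = await state.redis.requeue(
        state.pkey,             // KEYS[1]: pending pages hash
        state.qkey,             // KEYS[2]: URL queue (zset)
        state.pkey + ":" + url, // KEYS[3]: per-URL pending lock key
        state.fkey,             // KEYS[4]: failed pages list
        url,                    // ARGV[1]: the URL itself
        state.maxRetryPending   // ARGV[2]: max retries before failing
      );
      if (res === 1) {
        console.info(`Requeued: ${url}`);              // back on the queue for retry
      } else if (res === 2) {
        console.info(`Not requeuing anymore: ${url}`); // pushed to the failed list
      }
      // res === 0: pending lock still live (or no record), nothing to do
      return res === 1;
    }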