From 416a62fb7a52c7d7faa3db4e317c5db817b2d868 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 31 Aug 2023 11:47:49 -0700
Subject: [PATCH] improve logging / healthcheck responsiveness:

- avoid double logging of exceptions
- ensure page is included in all errors
- treat some timeouts as warnings
- remove unused responseHeaders call in browser
- fix screenshot err logging

healthcheck:
- interrupt crawl directly after healthcheck fails
---
 crawler.js          | 168 +++++++++++++++++++++++++-------------------
 util/browser.js     |   4 --
 util/healthcheck.js |  13 +++-
 util/screenshots.js |   8 +--
 util/state.js       |  33 +++++----
 util/timing.js      |   5 +-
 util/worker.js      |   6 +-
 7 files changed, 140 insertions(+), 97 deletions(-)

diff --git a/crawler.js b/crawler.js
index 493faa14..431757df 100644
--- a/crawler.js
+++ b/crawler.js
@@ -546,6 +546,10 @@ self.__bx_behaviors.selectMainBehavior();
       if (this.healthChecker) {
         this.healthChecker.incError();
       }
+      this.interrupted = this.healthChecker && this.healthChecker.isFailing();
+      if (this.interrupted) {
+        logger.warn("Interrupting due to too many failures, will restart");
+      }
     }
 
     await this.serializeConfig();
@@ -992,7 +996,7 @@ self.__bx_behaviors.selectMainBehavior();
         logger.debug(stdout.join("\n"));
       }
       if (stderr.length && this.params.logging.includes("debug")) {
-        logger.error(stderr.join("\n"));
+        logger.debug(stderr.join("\n"));
       }
       resolve(code);
     });
@@ -1046,20 +1050,24 @@ self.__bx_behaviors.selectMainBehavior();
 
     const failCrawlOnError = ((depth === 0) && this.params.failOnFailedSeed);
 
-    let isHTMLPage = await timedRun(
+    data.isHTMLPage = await timedRun(
       this.isHTML(url),
       FETCH_TIMEOUT_SECS,
       "HEAD request to determine if URL is HTML page timed out",
-      logDetails
+      logDetails,
+      "fetch",
+      true
     );
 
-    if (!isHTMLPage) {
+    if (!data.isHTMLPage) {
       try {
         const captureResult = await timedRun(
           this.directFetchCapture(url),
           FETCH_TIMEOUT_SECS,
           "Direct fetch capture attempt timed out",
-          logDetails
+          logDetails,
+          "fetch",
+          true
         );
         if (captureResult) {
           logger.info("Direct fetch successful", {url, ...logDetails}, "fetch");
           return;
         }
@@ -1070,73 +1078,11 @@ self.__bx_behaviors.selectMainBehavior();
       }
     }
 
-    let ignoreAbort = false;
-
-    // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
-    // if so, don't report as an error
-    page.once("requestfailed", (req) => {
-      ignoreAbort = shouldIgnoreAbort(req);
-    });
-
-    if (isHTMLPage) {
-      page.once("domcontentloaded", () => {
-        data.loadState = LoadState.CONTENT_LOADED;
-      });
-    }
-
-    const gotoOpts = isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
-
-    logger.info("Awaiting page load", logDetails);
-
-    try {
-      const resp = await page.goto(url, gotoOpts);
-
-      // Handle 4xx or 5xx response as a page load error
-      const statusCode = resp.status();
-      if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
-        if (failCrawlOnError) {
-          logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
-        } else {
-          logger.error("Page Load Error, skipping page", {statusCode, ...logDetails});
-          throw new Error(`Page ${url} returned status code ${statusCode}`);
-        }
-      }
-
-      const contentType = await this.browser.responseHeader(resp, "content-type");
-
-      isHTMLPage = this.isHTMLContentType(contentType);
-
-    } catch (e) {
-      let msg = e.message || "";
-      if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
-        if (e.name === "TimeoutError") {
-          if (data.loadState !== LoadState.CONTENT_LOADED) {
-            if (failCrawlOnError) {
-              logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
-            } else {
-              logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
-              throw e;
-            }
-          } else {
-            logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
-            data.skipBehaviors = true;
-          }
-        } else {
-          if (failCrawlOnError) {
-            logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
-          } else {
-            logger.error("Page Load Error, skipping page", {msg, ...logDetails});
-            throw e;
-          }
-        }
-      }
-    }
+    await this.doGotoPage(page, url, data, logDetails, failCrawlOnError);
 
     data.loadState = LoadState.FULL_PAGE_LOADED;
 
-    data.isHTMLPage = isHTMLPage;
-
-    if (isHTMLPage) {
+    if (data.isHTMLPage) {
       let frames = await page.frames();
       frames = await Promise.allSettled(frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)));
 
@@ -1147,7 +1093,7 @@ self.__bx_behaviors.selectMainBehavior();
       data.filteredFrames = [];
     }
 
-    if (!isHTMLPage) {
+    if (!data.isHTMLPage) {
       logger.debug("Skipping link extraction for non-HTML page", logDetails);
       return;
     }
@@ -1172,6 +1118,82 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
+  async doGotoPage(page, url, data, logDetails, failCrawlOnError) {
+    let ignoreAbort = false;
+
+    // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
+    // if so, don't report as an error
+    page.once("requestfailed", (req) => {
+      ignoreAbort = shouldIgnoreAbort(req);
+    });
+
+    if (data.isHTMLPage) {
+      page.once("domcontentloaded", () => {
+        data.loadState = LoadState.CONTENT_LOADED;
+      });
+    }
+
+    const gotoOpts = data.isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
+
+    logger.info("Awaiting page load", logDetails);
+
+    let mainPageResp = null;
+
+    try {
+      mainPageResp = await page.goto(url, gotoOpts);
+
+    } catch (e) {
+      let msg = e.message || "";
+      if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
+        if (e.name === "TimeoutError") {
+          if (data.loadState !== LoadState.CONTENT_LOADED) {
+            if (failCrawlOnError) {
+              logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
+            } else {
+              logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
+              e.message = "logged";
+              throw e;
+            }
+          } else {
+            logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
+            data.skipBehaviors = true;
+          }
+        } else {
+          if (failCrawlOnError) {
+            logger.fatal("Seed Page Load Error, failing crawl", {msg, ...logDetails});
+          } else {
+            logger.error("Page Load Error, skipping page", {msg, ...logDetails});
+            e.message = "logged";
+            throw e;
+          }
+        }
+      }
+    }
+
+    if (!mainPageResp) {
+      return;
+    }
+
+    // Handle 4xx or 5xx response as a page load error
+    const statusCode = mainPageResp.status();
+
+    if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
+      if (failCrawlOnError) {
+        logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
+      } else {
+        logger.error("Invalid status code, skipping page", {statusCode, ...logDetails});
+        throw new Error("logged");
+      }
+    }
+
+    // attempt to determine content-type more accurately, ignore if failed
+    try {
+      data.isHTMLPage = this.isHTMLContentType(mainPageResp.headers()["content-type"]);
+    } catch (e) {
+      // ignore
+    }
+  }
+
   async netIdle(page, details) {
     if (!this.params.netIdleWait) {
       return;
@@ -1267,7 +1289,11 @@ self.__bx_behaviors.selectMainBehavior();
 
     while (await timedRun(
       page.$("div.cf-browser-verification.cf-im-under-attack"),
-      PAGE_OP_TIMEOUT_SECS
+      PAGE_OP_TIMEOUT_SECS,
+      "Cloudflare check timed out",
+      logDetails,
+      "general",
+      true
     )) {
       logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails);
       await sleep(5.5);
diff --git a/util/browser.js b/util/browser.js
index b5f628e8..806c0f17 100644
--- a/util/browser.js
+++ b/util/browser.js
@@ -280,10 +280,6 @@ export class Browser extends BaseBrowser
     return {page, cdp};
   }
 
-  async responseHeader(resp, header) {
-    return await resp.headers()[header];
-  }
-
   async evaluateWithCLI(_, frame, cdp, funcString, logData, contextName) {
     const context = await frame.executionContext();
     cdp = context._client;
diff --git a/util/healthcheck.js b/util/healthcheck.js
index 36506ffc..f386f031 100644
--- a/util/healthcheck.js
+++ b/util/healthcheck.js
@@ -10,17 +10,22 @@ export class HealthChecker
     this.port = port;
     this.errorCount = 0;
     this.errorThreshold = errorThreshold;
+    this.resetCount = 0;
 
     this.healthServer = http.createServer((...args) => this.healthCheck(...args));
     logger.info(`Healthcheck server started on ${port}`, {}, "healthcheck");
     this.healthServer.listen(port);
   }
 
+  isFailing() {
+    return this.errorCount >= this.errorThreshold;
+  }
+
   async healthCheck(req, res) {
     const pathname = url.parse(req.url).pathname;
     switch (pathname) {
     case "/healthz":
-      if (this.errorCount < this.errorThreshold) {
+      if (!this.isFailing()) {
         logger.debug(`health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`, {}, "healthcheck");
         res.writeHead(200);
         res.end();
@@ -34,9 +39,13 @@ export class HealthChecker
   }
 
   resetErrors() {
-    if (this.errorCount > 0) {
+    this.resetCount++;
+    if (this.errorCount > 0 && this.resetCount >= 2) {
       logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`, {}, "healthcheck");
       this.errorCount = 0;
+      this.resetCount = 0;
+    } else if (!this.errorCount) {
+      this.resetCount = 0;
     }
   }
 
diff --git a/util/screenshots.js b/util/screenshots.js
index 26f64a09..44f24ec6 100644
--- a/util/screenshots.js
+++ b/util/screenshots.js
@@ -3,7 +3,7 @@ import path from "path";
 
 import * as warcio from "warcio";
 import sharp from "sharp";
 
-import { logger } from "./logger.js";
+import { logger, errJSON } from "./logger.js";
 
 
 // ============================================================================
@@ -47,7 +47,7 @@ export class Screenshots {
       await this.writeBufferToWARC(screenshotBuffer, screenshotType, options.type);
       logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
     } catch (e) {
-      logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
+      logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
     }
   }
 
@@ -56,8 +56,8 @@ export class Screenshots {
   }
 
   async takeThumbnail() {
+    const screenshotType = "thumbnail";
     try {
-      const screenshotType = "thumbnail";
       await this.browser.setViewport(this.page, {width: 1920, height: 1080});
       const options = screenshotTypes[screenshotType];
       const screenshotBuffer = await this.page.screenshot(options);
@@ -68,7 +68,7 @@ export class Screenshots {
       await this.writeBufferToWARC(thumbnailBuffer, screenshotType, options.type);
       logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
     } catch (e) {
-      logger.error(`Taking screenshot (type: thumbnail) failed for ${this.url}`, e.message);
+      logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
     }
   }
 
diff --git a/util/state.js b/util/state.js
index 0cfe4547..38960a0d 100644
--- a/util/state.js
+++ b/util/state.js
@@ -134,7 +134,7 @@ end
   });
 
   redis.defineCommand("requeue", {
-    numberOfKeys: 3,
+    numberOfKeys: 4,
     lua: `
 local res = redis.call('get', KEYS[3]);
 if not res then
@@ -148,6 +148,9 @@ if not res then
     redis.call('zadd', KEYS[2], 0, json);
     return 1;
   else
+    data['failed'] = '1';
+    json = cjson.encode(data);
+    redis.call('lpush', KEYS[4], json);
     return 2;
   end
 end
@@ -179,7 +182,8 @@ return 0;
   }
 
   async markFailed(url) {
-    await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
+    //await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
+    await this.requeueOrFail(url);
 
     return await this.redis.incr(this.dkey);
   }
@@ -416,19 +420,24 @@ return 0;
     const pendingUrls = await this.redis.hkeys(this.pkey);
 
     for (const url of pendingUrls) {
-      const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url, this.maxRetryPending);
-      switch (res) {
-      case 1:
-        logger.info(`Requeued: ${url}`);
-        break;
-
-      case 2:
-        logger.info(`Not requeuing anymore: ${url}`);
-        break;
-      }
+      await this.requeueOrFail(url);
     }
   }
 
+  async requeueOrFail(url) {
+    const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, this.fkey, url, this.maxRetryPending);
+    switch (res) {
+    case 1:
+      logger.info(`Requeued: ${url}`);
+      break;
+
+    case 2:
+      logger.info(`Not requeuing anymore: ${url}`);
+      break;
+    }
+    return res === 1;
+  }
+
   async queueSize() {
     this._lastSize = await this.redis.zcard(this.qkey);
     return this._lastSize;
diff --git a/util/timing.js b/util/timing.js
index 926d0402..b0f63c2d 100644
--- a/util/timing.js
+++ b/util/timing.js
@@ -4,7 +4,7 @@
 export function sleep(seconds) {
   return new Promise(resolve => setTimeout(resolve, seconds * 1000));
 }
 
-export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general") {
+export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general", isWarn=false) {
   // return Promise return value or log error if timeout is reached first
   const timeout = seconds * 1000;
 
@@ -17,7 +17,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai
   return Promise.race([promise, rejectPromiseOnTimeout(timeout)])
     .catch((err) => {
       if (err == "timeout reached") {
-        logger.error(message, {"seconds": seconds, ...logDetails}, context);
+        const logFunc = isWarn ? logger.warn : logger.error;
+        logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context);
       } else {
         //logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
         throw err;
diff --git a/util/worker.js b/util/worker.js
index 2f61b754..6117956e 100644
--- a/util/worker.js
+++ b/util/worker.js
@@ -167,14 +167,16 @@ export class PageWorker
             this.crawler.crawlPage(opts),
             this.maxPageTime,
             "Page Worker Timeout",
-            {workerid},
+            this.logDetails,
             "worker"
           ),
           this.crashBreak
         ]);
 
       } catch (e) {
-        logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
+        if (e.message !== "logged") {
+          logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
+        }
       } finally {
         await this.crawler.pageFinished(data);
       }
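
Usage notes: the sketches below illustrate the patched APIs; URLs, stub names, and threshold values are stand-ins, not taken from the crawler.

The new timedRun signature adds isWarn as a sixth parameter, so callers can downgrade expected timeouts (the HEAD probe, direct fetch attempt, and Cloudflare check) from errors to warnings, keeping them out of the healthcheck error count. A minimal sketch, assuming a standalone ESM script and an arbitrary slow promise:

    import { timedRun } from "./util/timing.js";

    // stand-in for a HEAD probe or page operation that may legitimately stall
    const slowProbe = new Promise(resolve => setTimeout(() => resolve(true), 60000));

    // with isWarn=true, hitting the 5s timeout logs a warning in the "fetch"
    // context instead of an error
    const result = await timedRun(
      slowProbe,
      5,                                // seconds before timing out
      "Probe timed out",                // message logged on timeout
      { page: "https://example.com/" }, // extra log details
      "fetch",                          // log context
      true                              // isWarn: warn instead of error
    );
    // on timeout the rejection is swallowed after logging, so result is undefined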
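
The double-logging fix relies on a sentinel value: the page-level handler logs the failure once (with the page attached), rewrites e.message to "logged", and rethrows; the worker's catch-all then skips anything already reported. A self-contained sketch of the pattern; the logger stub and function names are illustrative, not the crawler's actual modules:

    const logger = {
      error: (msg, details = {}) => console.error(msg, JSON.stringify(details)),
    };

    async function loadPage(url) {
      try {
        throw new Error("net::ERR_CONNECTION_REFUSED"); // simulated load failure
      } catch (e) {
        // log once, with the page included, then mark the error as handled
        logger.error("Page Load Error, skipping page", { msg: e.message, page: url });
        e.message = "logged";
        throw e;
      }
    }

    async function workerIteration(url) {
      try {
        await loadPage(url);
      } catch (e) {
        // catch-all: only report errors not already logged downstream
        if (e.message !== "logged") {
          logger.error("Worker Exception", { msg: e.message, page: url });
        }
      }
    }

    await workerIteration("https://example.com/");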
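
Interrupting as soon as isFailing() trips means the crawler restarts itself at the same threshold at which /healthz would begin returning 503, rather than waiting for an orchestrator to kill it mid-page. A sketch of the flow with a simplified stand-in for HealthChecker (the threshold of 3 is arbitrary):

    class MiniHealthChecker {
      constructor(errorThreshold = 3) {
        this.errorCount = 0;
        this.errorThreshold = errorThreshold;
      }
      incError() { this.errorCount++; }
      isFailing() { return this.errorCount >= this.errorThreshold; }
    }

    const healthChecker = new MiniHealthChecker(3);
    let interrupted = false;

    function onPageFailed() {
      healthChecker.incError();
      // same guard as the patch: healthChecker may be absent if no port was configured
      interrupted = healthChecker && healthChecker.isFailing();
      if (interrupted) {
        console.warn("Interrupting due to too many failures, will restart");
        // the crawl loop sees `interrupted`, checkpoints state, and exits
      }
    }

    onPageFailed();
    onPageFailed();
    onPageFailed(); // third failure trips the threshold and interrupts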
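
With numberOfKeys raised to 4, the requeue script receives the failed-list key as KEYS[4] and records a URL there itself once maxRetryPending is exhausted, so markFailed and the pending-page sweep share one code path. A sketch of the call site and how its return codes read; the state object is assumed to carry the same key names the patch uses:

    // KEYS: pending hash, queue zset, per-URL pending lock, failed list
    async function requeueOrFail(state, url) {
      const res = await state.redis.requeue(
        state.pkey,             // KEYS[1]: pending pages hash
        state.qkey,             // KEYS[2]: URL queue (zset)
        state.pkey + ":" + url, // KEYS[3]: per-URL pending lock key
        state.fkey,             // KEYS[4]: failed pages list
        url,                    // ARGV[1]: the URL itself
        state.maxRetryPending   // ARGV[2]: max retries before failing
      );
      if (res === 1) {
        console.info(`Requeued: ${url}`);              // back on the queue for retry
      } else if (res === 2) {
        console.info(`Not requeuing anymore: ${url}`); // pushed to the failed list
      }
      // res === 0: pending lock still live (or no record), nothing to do
      return res === 1;
    }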