Better check to see if ERR_ABORTED should be ignored. (#127)

* error abort check: fix possible regression with req.failure() returning null, move the check to a separate function, and wrap it in an exception handler * bump version to 0.5.0-beta.6
2025-10-19 06:23:16 +00:00 · 2022-03-14 14:41:39 -07:00 · 2022-03-14 14:41:39 -07:00 · 1fae21b0cf
commit 1fae21b0cf
parent ab096cd5b0
2 changed files with 25 additions and 17 deletions
--- a/crawler.js
+++ b/crawler.js
@ -568,22 +568,7 @@ class Crawler {
    // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
    // if so, don't report as an error
    page.on("requestfailed", (req) => {
-      const failure = req.failure().errorText;
-      if (failure !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
-        return;
-      }
-
-      const resp = req.response();
-      const headers = resp && resp.headers();
-
-      if (!headers) {
-        return;
-      }
-
-      if (headers["content-disposition"] || 
-         (headers["content-type"] && !headers["content-type"].startsWith("text/"))) {
-        ignoreAbort = true;
-      }
+      ignoreAbort = ignoreAbort || shouldIgnoreAbort(req); // latch: a later unrelated failed request must not clear the flag
    });

    try {
@ -978,4 +963,27 @@ class Crawler {
  }
 }

+// True only for an aborted "document" request that is really a download or non-text response.
+function shouldIgnoreAbort(req) {
+  try {
+    const failure = req.failure(); // may be null, so guard before reading errorText
+    const errorText = failure && failure.errorText;
+    if (errorText !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
+      return false;
+    }
+    const resp = req.response();
+    const headers = resp && resp.headers();
+    if (!headers) {
+      return false;
+    }
+
+    const contentType = headers["content-type"];
+    // Coerce so every path yields a strict boolean instead of falling through to undefined.
+    return Boolean(headers["content-disposition"] || (contentType && !contentType.startsWith("text/")));
+  } catch (e) {
+    // Best-effort check: if the request cannot be inspected, do not suppress the abort.
+    return false;
+  }
+}
+
 module.exports.Crawler = Crawler;
--- a/package.json
+++ b/package.json
@ -1,6 +1,6 @@
 {
  "name": "browsertrix-crawler",
-  "version": "0.5.0-beta.5",
+  "version": "0.5.0-beta.6",
  "main": "browsertrix-crawler",
  "repository": "https://github.com/webrecorder/browsertrix-crawler",
  "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",