Better check to see if ERR_ABORTED should be ignored. (#127)

* error abort check: fix possible regression when req.failure() returns null; also move the check to a separate function and wrap it in an exception handler
* bump version to 0.5.0-beta.6
This commit is contained in:
Ilya Kreymer 2022-03-14 14:41:39 -07:00 committed by GitHub
parent ab096cd5b0
commit 1fae21b0cf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 17 deletions

View file

@ -568,22 +568,7 @@ class Crawler {
// Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
// if so, don't report as an error
page.on("requestfailed", (req) => {
const failure = req.failure().errorText;
if (failure !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
return;
}
const resp = req.response();
const headers = resp && resp.headers();
if (!headers) {
return;
}
if (headers["content-disposition"] ||
(headers["content-type"] && !headers["content-type"].startsWith("text/"))) {
ignoreAbort = true;
}
ignoreAbort = shouldIgnoreAbort(req);
});
try {
@ -978,4 +963,27 @@ class Crawler {
}
}
/**
 * Decide whether a failed request is an ERR_ABORTED that should not be
 * reported as an error, because the response headers indicate a non-page
 * resource (e.g. a downloadable PDF) rather than a genuine load failure.
 *
 * @param {object} req - Failed request; must expose failure(), resourceType()
 *   and response() (response().headers() is read when a response exists).
 * @returns {boolean} true only when the failure text is "net::ERR_ABORTED",
 *   the resource type is "document", and the response carries a
 *   content-disposition header or a content-type not starting with "text/";
 *   false in every other case, including when inspecting the request throws.
 */
function shouldIgnoreAbort(req) {
  try {
    // failure() may return null, so read it once and guard before dereferencing.
    const failure = req.failure();
    const errorText = failure && failure.errorText;

    if (errorText !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
      return false;
    }

    const resp = req.response();
    const headers = resp && resp.headers();
    if (!headers) {
      return false;
    }

    const contentType = headers["content-type"];
    const isNonTextContent = Boolean(contentType) && !contentType.startsWith("text/");

    // Always return a strict boolean; never fall through to undefined.
    return Boolean(headers["content-disposition"]) || isNonTextContent;
  } catch (e) {
    // Deliberate best-effort: if the request cannot be inspected, do not
    // suppress the abort -- treat it as a regular failure.
    return false;
  }
}
module.exports.Crawler = Crawler;

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.5.0-beta.5",
"version": "0.5.0-beta.6",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",