mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Better check to see if ERR_ABORTED should be ignored. (#127)
* error abort check: Fix possible regression with req.failure() returning null, move the check to a separate function, and wrap it in an exception handler * bump version to 0.5.0-beta.6
This commit is contained in:
parent
ab096cd5b0
commit
1fae21b0cf
2 changed files with 25 additions and 17 deletions
40
crawler.js
40
crawler.js
|
@ -568,22 +568,7 @@ class Crawler {
|
|||
// Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
|
||||
// if so, don't report as an error
|
||||
page.on("requestfailed", (req) => {
|
||||
const failure = req.failure().errorText;
|
||||
if (failure !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
|
||||
return;
|
||||
}
|
||||
|
||||
const resp = req.response();
|
||||
const headers = resp && resp.headers();
|
||||
|
||||
if (!headers) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (headers["content-disposition"] ||
|
||||
(headers["content-type"] && !headers["content-type"].startsWith("text/"))) {
|
||||
ignoreAbort = true;
|
||||
}
|
||||
ignoreAbort = shouldIgnoreAbort(req);
|
||||
});
|
||||
|
||||
try {
|
||||
|
@ -978,4 +963,27 @@ class Crawler {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Decide whether a failed request's net::ERR_ABORTED should be ignored
 * because the abort was caused by loading a non-page document (e.g. a
 * downloadable PDF) rather than by a real error.
 *
 * @param {object} req - failed request; must expose failure(),
 *   resourceType() and response() methods.
 * @returns {boolean} true only when the request is a "document" aborted with
 *   net::ERR_ABORTED whose response has a content-disposition header or a
 *   non-text content-type; false in every other case, including when
 *   inspecting the request throws.
 */
function shouldIgnoreAbort(req) {
  try {
    // failure() may return null, so guard before reading errorText.
    const failure = req.failure();
    const errorText = failure && failure.errorText;

    if (errorText !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
      return false;
    }

    const resp = req.response();
    const headers = resp && resp.headers();

    if (!headers) {
      return false;
    }

    if (headers["content-disposition"]) {
      return true;
    }

    // Media types are case-insensitive, so normalize before comparing.
    const contentType = headers["content-type"];
    if (contentType && !String(contentType).toLowerCase().startsWith("text/")) {
      return true;
    }

    // Always return an explicit boolean instead of falling through.
    return false;
  } catch (e) {
    // Best-effort check: if the request can't be inspected, don't ignore the abort.
    return false;
  }
}
|
||||
|
||||
module.exports.Crawler = Crawler;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "0.5.0-beta.5",
|
||||
"version": "0.5.0-beta.6",
|
||||
"main": "browsertrix-crawler",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue