mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Better check to see if ERR_ABORTED should be ignored. (#127)
* error abort check: Fix possible regression with req.failure() returning null; also move the check to a separate function and wrap it in an exception handler * bump version to 0.5.0-beta.6
This commit is contained in:
parent
ab096cd5b0
commit
1fae21b0cf
2 changed files with 25 additions and 17 deletions
40
crawler.js
40
crawler.js
|
@ -568,22 +568,7 @@ class Crawler {
|
||||||
// Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
|
// Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
|
||||||
// if so, don't report as an error
|
// if so, don't report as an error
|
||||||
page.on("requestfailed", (req) => {
|
page.on("requestfailed", (req) => {
|
||||||
const failure = req.failure().errorText;
|
ignoreAbort = shouldIgnoreAbort(req);
|
||||||
if (failure !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const resp = req.response();
|
|
||||||
const headers = resp && resp.headers();
|
|
||||||
|
|
||||||
if (!headers) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (headers["content-disposition"] ||
|
|
||||||
(headers["content-type"] && !headers["content-type"].startsWith("text/"))) {
|
|
||||||
ignoreAbort = true;
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
@ -978,4 +963,27 @@ class Crawler {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Detect if ERR_ABORTED is actually caused by trying to load a non-page
// resource (eg. a downloadable PDF); if so, the abort should not be
// reported as a page load error.
//
// @param {Object} req - the failed request (puppeteer HTTPRequest-like:
//   exposes failure(), resourceType(), response())
// @returns {boolean} true if the ERR_ABORTED failure should be ignored
function shouldIgnoreAbort(req) {
  try {
    // req.failure() may return null, so guard before reading errorText
    const failure = req.failure() && req.failure().errorText;

    // only consider aborts of top-level document loads
    if (failure !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
      return false;
    }

    const resp = req.response();
    const headers = resp && resp.headers();

    if (!headers) {
      return false;
    }

    // a download (content-disposition) or a non-text payload indicates the
    // browser aborted rendering because the response is not a page
    if (headers["content-disposition"] ||
        (headers["content-type"] && !headers["content-type"].startsWith("text/"))) {
      return true;
    }

    // fix: explicitly return false here — previously execution fell through
    // the end of the try block and the function returned undefined
    return false;
  } catch (e) {
    // any unexpected error while inspecting the request means we cannot
    // prove the abort is benign, so do not ignore it
    return false;
  }
}
|
||||||
|
|
||||||
module.exports.Crawler = Crawler;
|
module.exports.Crawler = Crawler;
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "browsertrix-crawler",
|
"name": "browsertrix-crawler",
|
||||||
"version": "0.5.0-beta.5",
|
"version": "0.5.0-beta.6",
|
||||||
"main": "browsertrix-crawler",
|
"main": "browsertrix-crawler",
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue