mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Handle 404 / other error status codes that come back with an empty response:
- Chrome reports net::ERR_HTTP_RESPONSE_CODE_FAILURE for these requests
- store a WARC record with the empty response
- don't retry the page; save it with loadState: 1
- fixes #789
This commit is contained in:
parent
5fedde6eee
commit
4522f42f4e
3 changed files with 21 additions and 2 deletions
|
@ -1202,7 +1202,8 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
async pageFinished(data: PageState) {
|
||||
// if page loaded, considered page finished successfully
|
||||
// (even if behaviors timed out)
|
||||
const { loadState, logDetails, depth, url, pageSkipped } = data;
|
||||
const { loadState, logDetails, depth, url, pageSkipped, skipRetries } =
|
||||
data;
|
||||
|
||||
if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
|
||||
await this.writePage(data);
|
||||
|
@ -1222,7 +1223,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
if (pageSkipped) {
|
||||
await this.crawlState.markExcluded(url);
|
||||
} else {
|
||||
const retry = await this.crawlState.markFailed(url);
|
||||
const retry = await this.crawlState.markFailed(url, skipRetries);
|
||||
|
||||
if (this.healthChecker) {
|
||||
this.healthChecker.incError();
|
||||
|
@ -1974,9 +1975,13 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
let firstResponse: HTTPResponse | null = null;
|
||||
let fullLoadedResponse: HTTPResponse | null = null;
|
||||
|
||||
let isErrStatusCode = false;
|
||||
|
||||
// Detect if failure is actually caused by trying to load a non-page (eg. downloadable PDF),
|
||||
// store the downloadResponse, if any
|
||||
page.once("requestfailed", (req: HTTPRequest) => {
|
||||
isErrStatusCode =
|
||||
req.failure()?.errorText === "net::ERR_HTTP_RESPONSE_CODE_FAILURE";
|
||||
downloadResponse = getDownloadResponse(req);
|
||||
});
|
||||
|
||||
|
@ -2103,6 +2108,12 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
|
||||
if (failed) {
|
||||
// failure due to status code, don't retry
|
||||
if (isErrStatusCode) {
|
||||
data.skipRetries = true;
|
||||
return;
|
||||
}
|
||||
|
||||
return this.pageFailed(
|
||||
isChromeError ? "Page Crashed on Load" : "Page Invalid Status",
|
||||
retry,
|
||||
|
|
|
@ -468,6 +468,13 @@ export class Recorder {
|
|||
}
|
||||
break;
|
||||
|
||||
case "net::ERR_HTTP_RESPONSE_CODE_FAILURE":
|
||||
if (reqresp.status !== 200) {
|
||||
this.removeReqResp(requestId);
|
||||
return this.serializeToWARC(reqresp);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
logger.warn(
|
||||
"Request failed",
|
||||
|
|
|
@ -76,6 +76,7 @@ export class PageState {
|
|||
|
||||
skipBehaviors = false;
|
||||
pageSkipped = false;
|
||||
skipRetries = false;
|
||||
filteredFrames: Frame[] = [];
|
||||
loadState: LoadState = LoadState.FAILED;
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue