From 993081d3eefac569f64621ed763cd355c83a2be9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 5 Dec 2025 16:56:42 -0800 Subject: [PATCH] better handling of net::ERR_HTTP_RESPONSE_CODE_FAILURE: (#934) - http headers provided but no payload, record response - record page as failed with status code provided, don't attempt to retry --- src/crawler.ts | 17 ++++++++++++++--- src/util/recorder.ts | 10 ++++++++++ src/util/state.ts | 2 ++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index e369fcbc..6a0f3f71 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1327,7 +1327,7 @@ self.__bx_behaviors.selectMainBehavior(); } // if page loaded, considered page finished successfully // (even if behaviors timed out) - const { loadState, logDetails, depth, url, pageSkipped } = data; + const { loadState, logDetails, depth, url, pageSkipped, noRetries } = data; if (data.loadState >= LoadState.FULL_PAGE_LOADED) { await this.writePage(data); @@ -1347,7 +1347,7 @@ self.__bx_behaviors.selectMainBehavior(); if (pageSkipped) { await this.crawlState.markExcluded(url); } else { - const retry = await this.crawlState.markFailed(url); + const retry = await this.crawlState.markFailed(url, noRetries); if (this.healthChecker) { this.healthChecker.incError(); @@ -2215,8 +2215,8 @@ self.__bx_behaviors.selectMainBehavior(); if (msg !== "logged") { const loadState = data.loadState; - // excluded in recorder if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) { + // excluded in recorder data.pageSkipped = true; logger.warn("Page Load Blocked, skipping", { msg, loadState }); } else { @@ -2274,6 +2274,17 @@ self.__bx_behaviors.selectMainBehavior(); } if (failed) { + const failText = resp.request().failure()?.errorText; + if (isChromeError && failText === "net::ERR_HTTP_RESPONSE_CODE_FAILURE") { + data.noRetries = true; + logger.warn( + "Page is an empty non-200 response, not retrying", + { url, status, ...logDetails }, + "pageStatus", + ); + throw new Error("logged"); + } + return this.pageFailed( isChromeError ? "Page Crashed on Load" : "Page Invalid Status", retry, diff --git a/src/util/recorder.ts b/src/util/recorder.ts index d36930c5..ccd2c17d 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -464,6 +464,16 @@ export class Recorder extends EventEmitter { } break; + case "net::ERR_HTTP_RESPONSE_CODE_FAILURE": + logger.warn("Recording empty non-200 status response", { + url, + status: reqresp.status, + errorText, + type, + ...this.logDetails, + }); + return this.serializeToWARC(reqresp); + default: this.lastErrorText = errorText; logger.warn( diff --git a/src/util/state.ts b/src/util/state.ts index 3df430fc..bb973b11 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -89,6 +89,8 @@ export class PageState { skipBehaviors = false; pageSkipped = false; + noRetries = false; + asyncLoading = false; filteredFrames: Frame[] = []; loadState: LoadState = LoadState.FAILED;