better handling of net::ERR_HTTP_RESPONSE_CODE_FAILURE: (#934)

- when HTTP headers are provided but there is no payload, record the response
- record the page as failed with the status code provided; don't attempt to
retry
This commit is contained in:
Ilya Kreymer 2025-12-05 16:56:42 -08:00 committed by GitHub
parent 822de93301
commit 993081d3ee
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 26 additions and 3 deletions

View file

@ -1327,7 +1327,7 @@ self.__bx_behaviors.selectMainBehavior();
} }
// if page loaded, considered page finished successfully // if page loaded, considered page finished successfully
// (even if behaviors timed out) // (even if behaviors timed out)
const { loadState, logDetails, depth, url, pageSkipped } = data; const { loadState, logDetails, depth, url, pageSkipped, noRetries } = data;
if (data.loadState >= LoadState.FULL_PAGE_LOADED) { if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
await this.writePage(data); await this.writePage(data);
@ -1347,7 +1347,7 @@ self.__bx_behaviors.selectMainBehavior();
if (pageSkipped) { if (pageSkipped) {
await this.crawlState.markExcluded(url); await this.crawlState.markExcluded(url);
} else { } else {
const retry = await this.crawlState.markFailed(url); const retry = await this.crawlState.markFailed(url, noRetries);
if (this.healthChecker) { if (this.healthChecker) {
this.healthChecker.incError(); this.healthChecker.incError();
@ -2215,8 +2215,8 @@ self.__bx_behaviors.selectMainBehavior();
if (msg !== "logged") { if (msg !== "logged") {
const loadState = data.loadState; const loadState = data.loadState;
// excluded in recorder
if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) { if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) {
// excluded in recorder
data.pageSkipped = true; data.pageSkipped = true;
logger.warn("Page Load Blocked, skipping", { msg, loadState }); logger.warn("Page Load Blocked, skipping", { msg, loadState });
} else { } else {
@ -2274,6 +2274,17 @@ self.__bx_behaviors.selectMainBehavior();
} }
if (failed) { if (failed) {
const failText = resp.request().failure()?.errorText;
if (isChromeError && failText === "net::ERR_HTTP_RESPONSE_CODE_FAILURE") {
data.noRetries = true;
logger.warn(
"Page is an empty non-200 response, not retrying",
{ url, status, ...logDetails },
"pageStatus",
);
throw new Error("logged");
}
return this.pageFailed( return this.pageFailed(
isChromeError ? "Page Crashed on Load" : "Page Invalid Status", isChromeError ? "Page Crashed on Load" : "Page Invalid Status",
retry, retry,

View file

@ -464,6 +464,16 @@ export class Recorder extends EventEmitter {
} }
break; break;
case "net::ERR_HTTP_RESPONSE_CODE_FAILURE":
logger.warn("Recording empty non-200 status response", {
url,
status: reqresp.status,
errorText,
type,
...this.logDetails,
});
return this.serializeToWARC(reqresp);
default: default:
this.lastErrorText = errorText; this.lastErrorText = errorText;
logger.warn( logger.warn(

View file

@ -89,6 +89,8 @@ export class PageState {
skipBehaviors = false; skipBehaviors = false;
pageSkipped = false; pageSkipped = false;
noRetries = false;
asyncLoading = false; asyncLoading = false;
filteredFrames: Frame[] = []; filteredFrames: Frame[] = [];
loadState: LoadState = LoadState.FAILED; loadState: LoadState = LoadState.FAILED;