Better handling of net::ERR_HTTP_RESPONSE_CODE_FAILURE (#934)

- when HTTP headers are provided but there is no payload, record the response
- record the page as failed with the provided status code, and don't attempt
  to retry
This commit is contained in:
Ilya Kreymer 2025-12-05 16:56:42 -08:00 committed by GitHub
parent 822de93301
commit 993081d3ee
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 26 additions and 3 deletions

View file

@ -1327,7 +1327,7 @@ self.__bx_behaviors.selectMainBehavior();
}
// if page loaded, considered page finished successfully
// (even if behaviors timed out)
const { loadState, logDetails, depth, url, pageSkipped } = data;
const { loadState, logDetails, depth, url, pageSkipped, noRetries } = data;
if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
await this.writePage(data);
@ -1347,7 +1347,7 @@ self.__bx_behaviors.selectMainBehavior();
if (pageSkipped) {
await this.crawlState.markExcluded(url);
} else {
const retry = await this.crawlState.markFailed(url);
const retry = await this.crawlState.markFailed(url, noRetries);
if (this.healthChecker) {
this.healthChecker.incError();
@ -2215,8 +2215,8 @@ self.__bx_behaviors.selectMainBehavior();
if (msg !== "logged") {
const loadState = data.loadState;
// excluded in recorder
if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) {
// excluded in recorder
data.pageSkipped = true;
logger.warn("Page Load Blocked, skipping", { msg, loadState });
} else {
@ -2274,6 +2274,17 @@ self.__bx_behaviors.selectMainBehavior();
}
if (failed) {
const failText = resp.request().failure()?.errorText;
if (isChromeError && failText === "net::ERR_HTTP_RESPONSE_CODE_FAILURE") {
data.noRetries = true;
logger.warn(
"Page is an empty non-200 response, not retrying",
{ url, status, ...logDetails },
"pageStatus",
);
throw new Error("logged");
}
return this.pageFailed(
isChromeError ? "Page Crashed on Load" : "Page Invalid Status",
retry,

View file

@ -464,6 +464,16 @@ export class Recorder extends EventEmitter {
}
break;
case "net::ERR_HTTP_RESPONSE_CODE_FAILURE":
logger.warn("Recording empty non-200 status response", {
url,
status: reqresp.status,
errorText,
type,
...this.logDetails,
});
return this.serializeToWARC(reqresp);
default:
this.lastErrorText = errorText;
logger.warn(

View file

@ -89,6 +89,8 @@ export class PageState {
skipBehaviors = false;
pageSkipped = false;
noRetries = false;
asyncLoading = false;
filteredFrames: Frame[] = [];
loadState: LoadState = LoadState.FAILED;