store page statusCode if not 200 (#477)

Don't treat non-200 pages as errors: still extract text, take
screenshots, and run behaviors.
Only consider actual page load errors, e.g. a chrome-error:// page URL, as
errors.
This commit is contained in:
Ilya Kreymer 2024-02-28 22:56:12 -08:00 committed by GitHub
parent fba4730d88
commit c348de270f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 22 additions and 17 deletions

View file

@ -91,6 +91,7 @@ type PageEntry = {
text?: string;
favIconUrl?: string;
ts?: string;
status?: number;
};
// ============================================================================
@ -719,6 +720,7 @@ self.__bx_behaviors.selectMainBehavior();
if (mime) {
data.mime = mime;
}
data.status = 200;
logger.info(
"Direct fetch successful",
{ url, ...logDetails },
@ -787,6 +789,10 @@ self.__bx_behaviors.selectMainBehavior();
data.loadState = LoadState.EXTRACTION_DONE;
if (data.status >= 400) {
return;
}
if (this.params.behaviorOpts) {
if (!data.isHTMLPage) {
logger.debug(
@ -1581,28 +1587,19 @@ self.__bx_behaviors.selectMainBehavior();
}
// Handle 4xx or 5xx response as a page load error
const statusCode = resp.status();
const statusString = statusCode.toString();
if (
statusString.startsWith("4") ||
statusString.startsWith("5") ||
isChromeError
) {
const status = resp.status();
data.status = status;
if (isChromeError) {
if (failCrawlOnError) {
logger.fatal("Seed Page Load Error, failing crawl", {
statusCode,
status,
...logDetails,
});
} else {
logger.error(
isChromeError
? "Page Crashed on Load"
: "Non-200 Status Code, skipping page",
{
statusCode,
logger.error("Page Crashed on Load", {
status,
...logDetails,
},
);
});
throw new Error("logged");
}
}
@ -1963,6 +1960,7 @@ self.__bx_behaviors.selectMainBehavior();
mime,
favicon,
ts,
status,
}: PageState) {
const row: PageEntry = { id: pageid!, url, title, loadState };
@ -1974,6 +1972,10 @@ self.__bx_behaviors.selectMainBehavior();
row.mime = mime;
}
if (status) {
row.status = status;
}
if (this.params.writePagesToRedis) {
await this.crawlState.writeToPagesQueue(JSON.stringify(row));
}

View file

@ -46,6 +46,8 @@ export class PageState {
depth: number;
extraHops: number;
status: number;
workerid!: WorkerId;
pageid?: string;
@ -70,6 +72,7 @@ export class PageState {
this.seedId = redisData.seedId;
this.depth = redisData.depth;
this.extraHops = redisData.extraHops || 0;
this.status = 0;
}
}