mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
store page statusCode if not 200 (#477)
Don't treat non-200 pages as errors; still extract text, take screenshots, and run behaviors. Only consider actual page load errors (e.g., a chrome-error:// page URL) as errors.
This commit is contained in:
parent
fba4730d88
commit
c348de270f
2 changed files with 22 additions and 17 deletions
|
@ -91,6 +91,7 @@ type PageEntry = {
|
|||
text?: string;
|
||||
favIconUrl?: string;
|
||||
ts?: string;
|
||||
status?: number;
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
|
@ -719,6 +720,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
if (mime) {
|
||||
data.mime = mime;
|
||||
}
|
||||
data.status = 200;
|
||||
logger.info(
|
||||
"Direct fetch successful",
|
||||
{ url, ...logDetails },
|
||||
|
@ -787,6 +789,10 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
data.loadState = LoadState.EXTRACTION_DONE;
|
||||
|
||||
if (data.status >= 400) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.params.behaviorOpts) {
|
||||
if (!data.isHTMLPage) {
|
||||
logger.debug(
|
||||
|
@ -1581,28 +1587,19 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
|
||||
// Handle 4xx or 5xx response as a page load error
|
||||
const statusCode = resp.status();
|
||||
const statusString = statusCode.toString();
|
||||
if (
|
||||
statusString.startsWith("4") ||
|
||||
statusString.startsWith("5") ||
|
||||
isChromeError
|
||||
) {
|
||||
const status = resp.status();
|
||||
data.status = status;
|
||||
if (isChromeError) {
|
||||
if (failCrawlOnError) {
|
||||
logger.fatal("Seed Page Load Error, failing crawl", {
|
||||
statusCode,
|
||||
status,
|
||||
...logDetails,
|
||||
});
|
||||
} else {
|
||||
logger.error(
|
||||
isChromeError
|
||||
? "Page Crashed on Load"
|
||||
: "Non-200 Status Code, skipping page",
|
||||
{
|
||||
statusCode,
|
||||
logger.error("Page Crashed on Load", {
|
||||
status,
|
||||
...logDetails,
|
||||
},
|
||||
);
|
||||
});
|
||||
throw new Error("logged");
|
||||
}
|
||||
}
|
||||
|
@ -1963,6 +1960,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
mime,
|
||||
favicon,
|
||||
ts,
|
||||
status,
|
||||
}: PageState) {
|
||||
const row: PageEntry = { id: pageid!, url, title, loadState };
|
||||
|
||||
|
@ -1974,6 +1972,10 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
row.mime = mime;
|
||||
}
|
||||
|
||||
if (status) {
|
||||
row.status = status;
|
||||
}
|
||||
|
||||
if (this.params.writePagesToRedis) {
|
||||
await this.crawlState.writeToPagesQueue(JSON.stringify(row));
|
||||
}
|
||||
|
|
|
@ -46,6 +46,8 @@ export class PageState {
|
|||
depth: number;
|
||||
extraHops: number;
|
||||
|
||||
status: number;
|
||||
|
||||
workerid!: WorkerId;
|
||||
|
||||
pageid?: string;
|
||||
|
@ -70,6 +72,7 @@ export class PageState {
|
|||
this.seedId = redisData.seedId;
|
||||
this.depth = redisData.depth;
|
||||
this.extraHops = redisData.extraHops || 0;
|
||||
this.status = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue