From c348de270f8637df24e1acd5d5d7b5318869fc07 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 28 Feb 2024 22:56:12 -0800 Subject: [PATCH] store page statusCode if not 200 (#477) Don't treat non-200 pages as errors: still extract text, take screenshots, and run behaviors. Only consider actual page load errors, e.g. a chrome-error:// page URL, as errors. --- src/crawler.ts | 36 +++++++++++++++++++----------------- src/util/state.ts | 3 +++ 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 917d7671..5bd2925f 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -91,6 +91,7 @@ type PageEntry = { text?: string; favIconUrl?: string; ts?: string; + status?: number; }; // ============================================================================ @@ -719,6 +720,7 @@ self.__bx_behaviors.selectMainBehavior(); if (mime) { data.mime = mime; } + data.status = 200; logger.info( "Direct fetch successful", { url, ...logDetails }, @@ -787,6 +789,10 @@ self.__bx_behaviors.selectMainBehavior(); data.loadState = LoadState.EXTRACTION_DONE; + if (data.status >= 400) { + return; + } + if (this.params.behaviorOpts) { if (!data.isHTMLPage) { logger.debug( @@ -1581,28 +1587,19 @@ self.__bx_behaviors.selectMainBehavior(); } // Handle 4xx or 5xx response as a page load error - const statusCode = resp.status(); - const statusString = statusCode.toString(); - if ( - statusString.startsWith("4") || - statusString.startsWith("5") || - isChromeError - ) { + const status = resp.status(); + data.status = status; + if (isChromeError) { if (failCrawlOnError) { logger.fatal("Seed Page Load Error, failing crawl", { - statusCode, + status, ...logDetails, }); } else { - logger.error( - isChromeError - ? 
"Page Crashed on Load" - : "Non-200 Status Code, skipping page", - { - statusCode, - ...logDetails, - }, - ); + logger.error("Page Crashed on Load", { + status, + ...logDetails, + }); throw new Error("logged"); } } @@ -1963,6 +1960,7 @@ self.__bx_behaviors.selectMainBehavior(); mime, favicon, ts, + status, }: PageState) { const row: PageEntry = { id: pageid!, url, title, loadState }; @@ -1974,6 +1972,10 @@ self.__bx_behaviors.selectMainBehavior(); row.mime = mime; } + if (status) { + row.status = status; + } + if (this.params.writePagesToRedis) { await this.crawlState.writeToPagesQueue(JSON.stringify(row)); } diff --git a/src/util/state.ts b/src/util/state.ts index 8052cad0..c28fa814 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -46,6 +46,8 @@ export class PageState { depth: number; extraHops: number; + status: number; + workerid!: WorkerId; pageid?: string; @@ -70,6 +72,7 @@ export class PageState { this.seedId = redisData.seedId; this.depth = redisData.depth; this.extraHops = redisData.extraHops || 0; + this.status = 0; } }