From 5961a521c28d1ca13eea96af6e7506c43419f578 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 17 Jan 2025 18:19:31 -0800 Subject: [PATCH] set failed URL retry to 5 by default --- src/crawler.ts | 2 +- src/util/constants.ts | 1 + src/util/state.ts | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index b1f12a08..63229618 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1921,7 +1921,7 @@ self.__bx_behaviors.selectMainBehavior(); } else if (!downloadResponse) { // log if not already log and rethrow, consider page failed if (msg !== "logged") { - logger.error("Page Load Failed, skipping page", { + logger.error("Page Load Failed, will retry", { msg, loadState: data.loadState, ...logDetails, diff --git a/src/util/constants.ts b/src/util/constants.ts index 72506c6c..0c5d6faf 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -27,6 +27,7 @@ export const ADD_LINK_FUNC = "__bx_addLink"; export const FETCH_FUNC = "__bx_fetch"; export const MAX_DEPTH = 1000000; +export const MAX_RETRY_FAILED = 5; export const FETCH_HEADERS_TIMEOUT_SECS = 30; export const PAGE_OP_TIMEOUT_SECS = 5; diff --git a/src/util/state.ts b/src/util/state.ts index 9388e478..3f5ad807 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -3,7 +3,7 @@ import { v4 as uuidv4 } from "uuid"; import { logger } from "./logger.js"; -import { MAX_DEPTH } from "./constants.js"; +import { MAX_DEPTH, MAX_RETRY_FAILED } from "./constants.js"; import { ScopedSeed } from "./seeds.js"; import { Frame } from "puppeteer-core"; @@ -170,7 +170,7 @@ export type SaveState = { // ============================================================================ export class RedisCrawlState { redis: Redis; - maxRetryPending = 1; + maxRetryPending = MAX_RETRY_FAILED; uid: string; key: string; @@ -608,7 +608,7 @@ return inx; } if (retryFailed) { - logger.debug("Retring failed URL", { url: data.url }, "state"); + logger.debug("Retrying failed URL", { url: data.url }, "state"); } await this.markStarted(data.url);