Make numRetries configurable (#754)

Add a --numRetries param that sets how many times a page that failed to load is retried; the default is now 1 instead of the previously hard-coded 5.
This commit is contained in:
Ilya Kreymer 2025-02-05 23:34:55 -08:00 committed by GitHub
parent f379da19be
commit 2e46140c3f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 30 additions and 12 deletions

View file

@ -240,6 +240,9 @@ Options:
s [boolean] [default: false] s [boolean] [default: false]
--writePagesToRedis If set, write page objects to redis --writePagesToRedis If set, write page objects to redis
[boolean] [default: false] [boolean] [default: false]
--numRetries If set, number of times to retry a p
age that failed to load
[number] [default: 1]
--failOnFailedSeed If set, crawler will fail with exit --failOnFailedSeed If set, crawler will fail with exit
code 1 if any seed fails. When combi code 1 if any seed fails. When combi
ned with --failOnInvalidStatus,will ned with --failOnInvalidStatus,will

View file

@ -46,7 +46,6 @@ import {
ExtractSelector, ExtractSelector,
PAGE_OP_TIMEOUT_SECS, PAGE_OP_TIMEOUT_SECS,
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS, SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
MAX_RETRY_FAILED,
} from "./util/constants.js"; } from "./util/constants.js";
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js"; import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
@ -378,6 +377,7 @@ export class Crawler {
this.crawlId, this.crawlId,
this.maxPageTime, this.maxPageTime,
os.hostname(), os.hostname(),
this.params.numRetries,
); );
// load full state from config // load full state from config
@ -1189,7 +1189,7 @@ self.__bx_behaviors.selectMainBehavior();
await this.checkLimits(); await this.checkLimits();
} else { } else {
if (retry >= MAX_RETRY_FAILED && !pageSkipped) { if (retry >= this.params.numRetries && !pageSkipped) {
await this.writePage(data); await this.writePage(data);
} }
if (pageSkipped) { if (pageSkipped) {

View file

@ -17,6 +17,7 @@ import {
DEFAULT_SELECTORS, DEFAULT_SELECTORS,
BEHAVIOR_TYPES, BEHAVIOR_TYPES,
ExtractSelector, ExtractSelector,
DEFAULT_NUM_RETRIES,
} from "./constants.js"; } from "./constants.js";
import { ScopedSeed } from "./seeds.js"; import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js"; import { interpolateFilename } from "./storage.js";
@ -549,6 +550,13 @@ class ArgParser {
default: false, default: false,
}, },
numRetries: {
describe:
"If set, number of times to retry a page that failed to load",
type: "number",
default: DEFAULT_NUM_RETRIES,
},
failOnFailedSeed: { failOnFailedSeed: {
describe: describe:
"If set, crawler will fail with exit code 1 if any seed fails. When combined with --failOnInvalidStatus," + "If set, crawler will fail with exit code 1 if any seed fails. When combined with --failOnInvalidStatus," +

View file

@ -27,7 +27,7 @@ export const ADD_LINK_FUNC = "__bx_addLink";
export const FETCH_FUNC = "__bx_fetch"; export const FETCH_FUNC = "__bx_fetch";
export const MAX_DEPTH = 1000000; export const MAX_DEPTH = 1000000;
export const MAX_RETRY_FAILED = 5; export const DEFAULT_NUM_RETRIES = 1;
export const FETCH_HEADERS_TIMEOUT_SECS = 30; export const FETCH_HEADERS_TIMEOUT_SECS = 30;
export const PAGE_OP_TIMEOUT_SECS = 5; export const PAGE_OP_TIMEOUT_SECS = 5;

View file

@ -3,7 +3,7 @@ import { v4 as uuidv4 } from "uuid";
import { logger } from "./logger.js"; import { logger } from "./logger.js";
import { MAX_DEPTH, MAX_RETRY_FAILED } from "./constants.js"; import { MAX_DEPTH, DEFAULT_NUM_RETRIES } from "./constants.js";
import { ScopedSeed } from "./seeds.js"; import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core"; import { Frame } from "puppeteer-core";
import { interpolateFilename } from "./storage.js"; import { interpolateFilename } from "./storage.js";
@ -126,7 +126,7 @@ declare module "ioredis" {
fkey: string, fkey: string,
qkey: string, qkey: string,
ffkey: string, ffkey: string,
maxRetryPending: number, maxRetries: number,
maxRegularDepth: number, maxRegularDepth: number,
): Result<number, Context>; ): Result<number, Context>;
@ -141,7 +141,7 @@ declare module "ioredis" {
qkey: string, qkey: string,
pkeyUrl: string, pkeyUrl: string,
url: string, url: string,
maxRetryPending: number, maxRetries: number,
maxRegularDepth: number, maxRegularDepth: number,
): Result<number, Context>; ): Result<number, Context>;
@ -170,7 +170,7 @@ export type SaveState = {
// ============================================================================ // ============================================================================
export class RedisCrawlState { export class RedisCrawlState {
redis: Redis; redis: Redis;
maxRetryPending = MAX_RETRY_FAILED; maxRetries: number;
uid: string; uid: string;
key: string; key: string;
@ -191,12 +191,19 @@ export class RedisCrawlState {
waczFilename: string | null = null; waczFilename: string | null = null;
constructor(redis: Redis, key: string, maxPageTime: number, uid: string) { constructor(
redis: Redis,
key: string,
maxPageTime: number,
uid: string,
maxRetries?: number,
) {
this.redis = redis; this.redis = redis;
this.uid = uid; this.uid = uid;
this.key = key; this.key = key;
this.maxPageTime = maxPageTime; this.maxPageTime = maxPageTime;
this.maxRetries = maxRetries || DEFAULT_NUM_RETRIES;
this.qkey = this.key + ":q"; this.qkey = this.key + ":q";
this.pkey = this.key + ":p"; this.pkey = this.key + ":p";
@ -609,7 +616,7 @@ return inx;
this.fkey, this.fkey,
this.qkey, this.qkey,
this.ffkey, this.ffkey,
this.maxRetryPending, this.maxRetries,
MAX_DEPTH, MAX_DEPTH,
); );
@ -835,7 +842,7 @@ return inx;
for (const json of state.failed) { for (const json of state.failed) {
const data = JSON.parse(json); const data = JSON.parse(json);
const retry = data.retry || 0; const retry = data.retry || 0;
if (retry <= this.maxRetryPending) { if (retry <= this.maxRetries) {
await this.redis.zadd(this.qkey, this._getScore(data), json); await this.redis.zadd(this.qkey, this._getScore(data), json);
} else { } else {
await this.redis.rpush(this.ffkey, json); await this.redis.rpush(this.ffkey, json);
@ -904,7 +911,7 @@ return inx;
this.qkey, this.qkey,
this.pkey + ":" + url, this.pkey + ":" + url,
url, url,
this.maxRetryPending, this.maxRetries,
MAX_DEPTH, MAX_DEPTH,
); );
switch (res) { switch (res) {

View file

@ -10,7 +10,7 @@ async function sleep(time) {
test("run crawl", async () => { test("run crawl", async () => {
let status = 0; let status = 0;
execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail`); execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail --numRetries 5`);
/* /*
async function runServer() { async function runServer() {