Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Synced 2025-10-19 14:33:17 +00:00
Make numRetries configurable (#754)

Add --numRetries param, default to 1 instead of 5.

parent: f379da19be
commit: 2e46140c3f

6 changed files with 30 additions and 12 deletions
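The new flag is passed on the crawl command line like any other option; for example (an illustrative invocation, adapted from the updated retry-fail test at the end of this diff):

    docker run --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --numRetries 5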
@@ -240,6 +240,9 @@ Options:
                                             s        [boolean] [default: false]
       --writePagesToRedis                   If set, write page objects to redis
                                                      [boolean] [default: false]
+      --numRetries                          If set, number of times to retry a p
+                                            age that failed to load
+                                                           [number] [default: 1]
       --failOnFailedSeed                    If set, crawler will fail with exit
                                             code 1 if any seed fails. When combi
                                             ned with --failOnInvalidStatus,will
@@ -46,7 +46,6 @@ import {
   ExtractSelector,
   PAGE_OP_TIMEOUT_SECS,
   SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
-  MAX_RETRY_FAILED,
 } from "./util/constants.js";

 import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";

@@ -378,6 +377,7 @@ export class Crawler {
       this.crawlId,
       this.maxPageTime,
       os.hostname(),
+      this.params.numRetries,
     );

     // load full state from config

@@ -1189,7 +1189,7 @@ self.__bx_behaviors.selectMainBehavior();

       await this.checkLimits();
     } else {
-      if (retry >= MAX_RETRY_FAILED && !pageSkipped) {
+      if (retry >= this.params.numRetries && !pageSkipped) {
         await this.writePage(data);
       }
       if (pageSkipped) {
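Taken together, these hunks thread the configured retry count from the CLI into the failure branch: a page is only written out as permanently failed once it has been retried numRetries times. A minimal sketch of that gate, with a hypothetical PageData shape and a writePage stub standing in for the crawler's real types:

    interface PageData {
      url: string;
      retry: number; // number of retries already attempted for this page
    }

    // Hypothetical stub; the real Crawler.writePage persists the page record.
    async function writePage(data: PageData): Promise<void> {
      console.log(`recording permanently failed page: ${data.url}`);
    }

    // Sketch of the failure branch above: only record the page as failed
    // once the retry budget is exhausted and the page was not skipped.
    async function onPageLoadFailed(
      data: PageData,
      numRetries: number,
      pageSkipped: boolean,
    ): Promise<void> {
      if (data.retry >= numRetries && !pageSkipped) {
        await writePage(data);
      }
    }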
@@ -17,6 +17,7 @@ import {
   DEFAULT_SELECTORS,
   BEHAVIOR_TYPES,
   ExtractSelector,
+  DEFAULT_NUM_RETRIES,
 } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";

@@ -549,6 +550,13 @@ class ArgParser {
         default: false,
       },

+      numRetries: {
+        describe:
+          "If set, number of times to retry a page that failed to load",
+        type: "number",
+        default: DEFAULT_NUM_RETRIES,
+      },
+
       failOnFailedSeed: {
         describe:
           "If set, crawler will fail with exit code 1 if any seed fails. When combined with --failOnInvalidStatus," +
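The new option uses the same yargs options-object shape as its neighbors. A self-contained sketch of how such a definition becomes a typed --numRetries flag (the parser wiring here is illustrative, not the crawler's actual ArgParser):

    import yargs from "yargs";
    import { hideBin } from "yargs/helpers";

    const DEFAULT_NUM_RETRIES = 1;

    // Minimal parser with just the new option; browsertrix-crawler
    // defines this alongside dozens of other options in ArgParser.
    const argv = yargs(hideBin(process.argv))
      .option("numRetries", {
        describe: "If set, number of times to retry a page that failed to load",
        type: "number",
        default: DEFAULT_NUM_RETRIES,
      })
      .parseSync();

    // `crawl --numRetries 5` yields argv.numRetries === 5; omitting
    // the flag yields the default of 1.
    console.log(argv.numRetries);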
@@ -27,7 +27,7 @@ export const ADD_LINK_FUNC = "__bx_addLink";
 export const FETCH_FUNC = "__bx_fetch";

 export const MAX_DEPTH = 1000000;
-export const MAX_RETRY_FAILED = 5;
+export const DEFAULT_NUM_RETRIES = 1;

 export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
@@ -3,7 +3,7 @@ import { v4 as uuidv4 } from "uuid";

 import { logger } from "./logger.js";

-import { MAX_DEPTH, MAX_RETRY_FAILED } from "./constants.js";
+import { MAX_DEPTH, DEFAULT_NUM_RETRIES } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 import { interpolateFilename } from "./storage.js";

@@ -126,7 +126,7 @@ declare module "ioredis" {
     fkey: string,
     qkey: string,
     ffkey: string,
-    maxRetryPending: number,
+    maxRetries: number,
     maxRegularDepth: number,
   ): Result<number, Context>;

@@ -141,7 +141,7 @@ declare module "ioredis" {
     qkey: string,
     pkeyUrl: string,
     url: string,
-    maxRetryPending: number,
+    maxRetries: number,
     maxRegularDepth: number,
   ): Result<number, Context>;

@@ -170,7 +170,7 @@ export type SaveState = {
 // ============================================================================
 export class RedisCrawlState {
   redis: Redis;
-  maxRetryPending = MAX_RETRY_FAILED;
+  maxRetries: number;

   uid: string;
   key: string;

@@ -191,12 +191,19 @@ export class RedisCrawlState {

   waczFilename: string | null = null;

-  constructor(redis: Redis, key: string, maxPageTime: number, uid: string) {
+  constructor(
+    redis: Redis,
+    key: string,
+    maxPageTime: number,
+    uid: string,
+    maxRetries?: number,
+  ) {
     this.redis = redis;

     this.uid = uid;
     this.key = key;
     this.maxPageTime = maxPageTime;
+    this.maxRetries = maxRetries || DEFAULT_NUM_RETRIES;

     this.qkey = this.key + ":q";
     this.pkey = this.key + ":p";

@@ -609,7 +616,7 @@ return inx;
       this.fkey,
       this.qkey,
       this.ffkey,
-      this.maxRetryPending,
+      this.maxRetries,
       MAX_DEPTH,
     );

@@ -835,7 +842,7 @@ return inx;
     for (const json of state.failed) {
       const data = JSON.parse(json);
       const retry = data.retry || 0;
-      if (retry <= this.maxRetryPending) {
+      if (retry <= this.maxRetries) {
         await this.redis.zadd(this.qkey, this._getScore(data), json);
       } else {
         await this.redis.rpush(this.ffkey, json);

@@ -904,7 +911,7 @@ return inx;
       this.qkey,
       this.pkey + ":" + url,
       url,
-      this.maxRetryPending,
+      this.maxRetries,
       MAX_DEPTH,
     );
     switch (res) {
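One subtlety in the constructor: the fallback is written as maxRetries || DEFAULT_NUM_RETRIES, so an explicit 0 (falsy in JavaScript) also resolves to the default of 1. A small sketch of the fallback plus the requeue decision it feeds, with the Redis plumbing omitted and RetryPolicy as a hypothetical stand-in for RedisCrawlState:

    const DEFAULT_NUM_RETRIES = 1;

    class RetryPolicy {
      maxRetries: number;

      constructor(maxRetries?: number) {
        // Same fallback as the constructor above: undefined (and 0,
        // being falsy) becomes the default of 1.
        this.maxRetries = maxRetries || DEFAULT_NUM_RETRIES;
      }

      // Mirrors the load-state loop: a failed page still within its retry
      // budget is requeued; otherwise it lands on the failed-forever list.
      shouldRequeue(retry: number): boolean {
        return retry <= this.maxRetries;
      }
    }

    // With --numRetries 5, a page on its 3rd retry is requeued; on its 6th it is not.
    const policy = new RetryPolicy(5);
    console.log(policy.shouldRequeue(3)); // true
    console.log(policy.shouldRequeue(6)); // false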
@@ -10,7 +10,7 @@ async function sleep(time) {

 test("run crawl", async () => {
   let status = 0;
-  execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail`);
+  execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail --numRetries 5`);

 /*
 async function runServer() {