Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Synced 2025-10-19 14:33:17 +00:00

Make numRetries configurable (#754)

Add --numRetries param, default to 1 instead of 5.

parent f379da19be
commit 2e46140c3f

6 changed files with 30 additions and 12 deletions
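
For reference, a minimal invocation sketch using the new flag, modeled on the updated retry-fail test at the end of this diff; the image name, paths, and URL are taken from that test and are illustrative only. Passing --numRetries 5 restores the previous retry limit, while omitting the flag now retries a failed page only once.

// Sketch: run a crawl with the new --numRetries option (assumes Docker and
// the webrecorder/browsertrix-crawler image; paths and URL are illustrative).
import { execSync } from "child_process";

execSync(
  "docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl " +
    "--url https://example.com/ --limit 2 --collection retry-fail --numRetries 5",
);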

@@ -240,6 +240,9 @@ Options:
 s                                     [boolean] [default: false]
       --writePagesToRedis             If set, write page objects to redis
                                       [boolean] [default: false]
+      --numRetries                    If set, number of times to retry a p
+                                      age that failed to load
+                                      [number] [default: 1]
       --failOnFailedSeed              If set, crawler will fail with exit
                                       code 1 if any seed fails. When combi
                                       ned with --failOnInvalidStatus,will

@@ -46,7 +46,6 @@ import {
   ExtractSelector,
   PAGE_OP_TIMEOUT_SECS,
   SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
-  MAX_RETRY_FAILED,
 } from "./util/constants.js";

 import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";

@@ -378,6 +377,7 @@ export class Crawler {
       this.crawlId,
       this.maxPageTime,
       os.hostname(),
+      this.params.numRetries,
     );

     // load full state from config

@@ -1189,7 +1189,7 @@ self.__bx_behaviors.selectMainBehavior();

       await this.checkLimits();
     } else {
-      if (retry >= MAX_RETRY_FAILED && !pageSkipped) {
+      if (retry >= this.params.numRetries && !pageSkipped) {
         await this.writePage(data);
       }
       if (pageSkipped) {

@@ -17,6 +17,7 @@ import {
   DEFAULT_SELECTORS,
   BEHAVIOR_TYPES,
   ExtractSelector,
+  DEFAULT_NUM_RETRIES,
 } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";

@@ -549,6 +550,13 @@ class ArgParser {
         default: false,
       },

+      numRetries: {
+        describe:
+          "If set, number of times to retry a page that failed to load",
+        type: "number",
+        default: DEFAULT_NUM_RETRIES,
+      },
+
       failOnFailedSeed: {
         describe:
           "If set, crawler will fail with exit code 1 if any seed fails. When combined with --failOnInvalidStatus," +

@@ -27,7 +27,7 @@ export const ADD_LINK_FUNC = "__bx_addLink";
 export const FETCH_FUNC = "__bx_fetch";

 export const MAX_DEPTH = 1000000;
-export const MAX_RETRY_FAILED = 5;
+export const DEFAULT_NUM_RETRIES = 1;

 export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;

@@ -3,7 +3,7 @@ import { v4 as uuidv4 } from "uuid";

 import { logger } from "./logger.js";

-import { MAX_DEPTH, MAX_RETRY_FAILED } from "./constants.js";
+import { MAX_DEPTH, DEFAULT_NUM_RETRIES } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 import { interpolateFilename } from "./storage.js";

@@ -126,7 +126,7 @@ declare module "ioredis" {
       fkey: string,
       qkey: string,
       ffkey: string,
-      maxRetryPending: number,
+      maxRetries: number,
       maxRegularDepth: number,
     ): Result<number, Context>;


@@ -141,7 +141,7 @@ declare module "ioredis" {
       qkey: string,
       pkeyUrl: string,
       url: string,
-      maxRetryPending: number,
+      maxRetries: number,
       maxRegularDepth: number,
     ): Result<number, Context>;


@@ -170,7 +170,7 @@ export type SaveState = {
 // ============================================================================
 export class RedisCrawlState {
   redis: Redis;
-  maxRetryPending = MAX_RETRY_FAILED;
+  maxRetries: number;

   uid: string;
   key: string;

@@ -191,12 +191,19 @@ export class RedisCrawlState {

   waczFilename: string | null = null;

-  constructor(redis: Redis, key: string, maxPageTime: number, uid: string) {
+  constructor(
+    redis: Redis,
+    key: string,
+    maxPageTime: number,
+    uid: string,
+    maxRetries?: number,
+  ) {
     this.redis = redis;

     this.uid = uid;
     this.key = key;
     this.maxPageTime = maxPageTime;
+    this.maxRetries = maxRetries || DEFAULT_NUM_RETRIES;

     this.qkey = this.key + ":q";
     this.pkey = this.key + ":p";

@@ -609,7 +616,7 @@ return inx;
       this.fkey,
       this.qkey,
       this.ffkey,
-      this.maxRetryPending,
+      this.maxRetries,
       MAX_DEPTH,
     );


@@ -835,7 +842,7 @@ return inx;
     for (const json of state.failed) {
       const data = JSON.parse(json);
       const retry = data.retry || 0;
-      if (retry <= this.maxRetryPending) {
+      if (retry <= this.maxRetries) {
         await this.redis.zadd(this.qkey, this._getScore(data), json);
       } else {
         await this.redis.rpush(this.ffkey, json);

@@ -904,7 +911,7 @@ return inx;
       this.qkey,
       this.pkey + ":" + url,
       url,
-      this.maxRetryPending,
+      this.maxRetries,
       MAX_DEPTH,
     );
     switch (res) {

@@ -10,7 +10,7 @@ async function sleep(time) {

 test("run crawl", async () => {
   let status = 0;
-  execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail`);
+  execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail --numRetries 5`);

   /*
   async function runServer() {