Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
Retry same queue (#757)
- Follow-up to #743: page retries are simply added back to the same queue with the `retry` param incremented and a higher score, after extraHops, to ensure retries are added at the end.
- Score calculation is `score = depth + (extraHops * MAX_DEPTH) + (retry * MAX_DEPTH * 2)`; this ensures that retries have lower priority than extraHops, and additional retries even lower priority (higher score). See the sketch after this list.
- A warning is logged when a retry happens; an error is logged only when all retries are exhausted.
- Back to one failure list; URLs are added there only when all retries are exhausted.
- Rename --numRetries -> --maxRetries / --retries for clarity.
- State load: allow retrying previously failed URLs if --maxRetries is higher than on the previous run.
- Ensure this works with --failOnFailedStatus: if provided, invalid status codes (>= 400) are retried along with page load failures.
- Fixes #132

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
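The score formula above determines fetch order in the queue (lower score is fetched first). A minimal TypeScript sketch of that ordering, not the crawler's actual code; the `MAX_DEPTH` value and the `pageScore` helper are assumptions for illustration:

```ts
// Assumed stand-in for the crawler's depth-cap constant.
const MAX_DEPTH = 1_000_000;

// Hypothetical helper implementing the score formula from the commit message.
function pageScore(depth: number, extraHops: number, retry: number): number {
  return depth + extraHops * MAX_DEPTH + retry * MAX_DEPTH * 2;
}

// Lower score = higher priority: a regular page at depth 2 sorts before
// an extraHops page, which sorts before any retry, and each additional
// retry pushes the page further back in the queue.
console.log(pageScore(2, 0, 0)); // 2
console.log(pageScore(2, 1, 0)); // 1000002 -> after regular pages
console.log(pageScore(2, 0, 1)); // 2000002 -> first retry, after extraHops
console.log(pageScore(2, 0, 2)); // 4000002 -> second retry, lower priority still
```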
Parent: 5c9d808651
Commit: 00835fc4f2
7 changed files with 218 additions and 131 deletions
@@ -17,7 +17,7 @@ import {
   DEFAULT_SELECTORS,
   BEHAVIOR_TYPES,
   ExtractSelector,
-  DEFAULT_NUM_RETRIES,
+  DEFAULT_MAX_RETRIES,
 } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";

@@ -550,11 +550,12 @@ class ArgParser {
       default: false,
     },

-    numRetries: {
+    maxPageRetries: {
+      alias: "retries",
       describe:
-        "If set, number of times to retry a page that failed to load",
+        "If set, number of times to retry a page that failed to load before page is considered to have failed",
       type: "number",
-      default: DEFAULT_NUM_RETRIES,
+      default: DEFAULT_MAX_RETRIES,
     },

     failOnFailedSeed: {
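After this rename, the option should be accepted under either spelling, since `retries` is declared as an alias of `maxPageRetries`. A hedged usage sketch, following the project's documented docker invocation pattern (the URL is a placeholder):

```sh
# Retry each failing page up to 5 times before recording it as failed;
# --retries 5 should be equivalent to --maxPageRetries 5 per the alias above.
docker run webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ --maxPageRetries 5
```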