Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
  - support interrupting and saving crawl
  - support loading crawl state (frontier queue, pending, done) from YAML
  - support scope check when loading to apply new scoping rules when restarting crawl
  - failed urls added to done as failed, can be retried if crawl is stopped and restarted
  - save state to crawls/crawl-<ts>-<id>.yaml when interrupted
  - --saveState option controls when crawl state is saved; defaults to 'partial' (save only when interrupted), can also be 'always' or 'never'
  - support in-memory or redis based crawl state, using fork of puppeteer-cluster
  - --redisStore used to enable redis-based state

* signals/crawl interruption (see the sketch after this list):
  - crawl state set to drain/not provide any more urls to crawl
  - graceful stop of crawl in response to sigint/sigterm
  - initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
  - initial sigabrt followed by sigterm terminates immediately
  - puppeteer: disable handleSIGTERM, handleSIGHUP, handleSIGINT

* redis state support (a sketch follows the redisStoreUrl/saveState diff hunk below):
  - use lua scripts for atomic move from queue -> pending, and pending -> done
  - pending key expiry set to page timeout
  - add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
  - drainMax returns the numPending() + numSeen() to work with cluster stats

* arg improvements:
  - add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
  - support setting cmdline args via env var CRAWL_ARGS
  - use 'choices' in args when possible

* build update:
  - switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
  - use setuptools<58.0

* misc crawl/scoping rule fixes:
  - scoping rules fix when external is used with scopeType
  - state/limit: ensure no urls, including initial seeds, are added past the limit
  - signals: fix immediate shutdown on second signal
  - tests: add scope test for default scope + excludes

* py-wacz update:
  - add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
  - pywb: use latest pywb branch for improved twitter video capture

* update to latest browsertrix-behaviors

* fix setuptools dependency #88

* update README for 0.5.0 beta
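The two-stage interruption described in the signals bullets can be pictured with a minimal Node.js sketch. This is not the crawler's actual implementation: the draining flag and pendingPages set below are stand-ins for the real crawl state (queue/pending/done).

// Minimal sketch of two-stage SIGINT/SIGTERM handling: first signal drains and
// waits for in-progress pages, second signal terminates immediately.
let interrupted = false;
let draining = false;
const pendingPages = new Set(); // URLs currently being crawled (stand-in for real state)

async function waitForPendingPages() {
  // poll until all in-progress pages have finished
  while (pendingPages.size > 0) {
    await new Promise((resolve) => setTimeout(resolve, 500));
  }
}

async function onInterrupt(signal) {
  if (interrupted) {
    // second SIGINT/SIGTERM: terminate immediately
    console.log(`${signal} received again, exiting now`);
    process.exit(1);
  }
  interrupted = true;
  draining = true; // stop handing out new URLs so the remaining queue can be saved

  console.log(`${signal} received, waiting for current pages, then saving state...`);
  await waitForPendingPages();
  process.exit(0);
}

process.on("SIGINT", () => onInterrupt("SIGINT"));
process.on("SIGTERM", () => onInterrupt("SIGTERM"));

Disabling puppeteer's own handleSIGINT/handleSIGTERM/handleSIGHUP (as noted above) is what lets a handler like this own the shutdown sequence instead of the browser being torn down immediately.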
parent 2956be2026
commit 39ddecd35e
13 changed files with 544 additions and 6407 deletions
@@ -1,5 +1,6 @@
 const path = require("path");
 const fs = require("fs");
+const os = require("os");
 
 const yaml = require("js-yaml");
 const puppeteer = require("puppeteer-core");
@@ -12,7 +13,6 @@ const { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } = require("./constants");
 const { ScopedSeed } = require("./seeds");
 
 
 // ============================================================================
 class ArgParser {
   get cliOpts() {
@@ -37,6 +37,13 @@
         type: "number",
       },
+
+      "crawlId": {
+        alias: "id",
+        describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var)",
+        type: "string",
+        default: process.env.CRAWL_ID || os.hostname(),
+      },
 
       "newContext": {
         describe: "The context for each new capture, can be a new: page, window, session or browser.",
         default: "page",
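For illustration only: this is how a crawl ID resolved with the same default as the option above could feed into the crawls/crawl-<ts>-<id>.yaml state filename mentioned in the commit message. The timestamp format is an assumption, not the crawler's actual one.

// Illustrative sketch: derive a crawl ID the way the option default above does,
// then build a state filename in the crawls/crawl-<ts>-<id>.yaml shape.
const os = require("os");
const path = require("path");

const crawlId = process.env.CRAWL_ID || os.hostname();
const ts = new Date().toISOString().replace(/[^\d]/g, ""); // assumed timestamp format
const stateFile = path.join("crawls", `crawl-${ts}-${crawlId}.yaml`);

console.log(stateFile); // e.g. crawls/crawl-20211115103000123-1f2e3d4c5b6a.yaml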
@@ -67,8 +74,9 @@
       },
 
       "scopeType": {
-        describe: "Predefined for which URLs to crawl, can be: prefix, page, host, any, or custom, to use the scopeIncludeRx/scopeExcludeRx",
+        describe: "A predfined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
         type: "string",
+        choices: ["page", "page-spa", "prefix", "host", "any", "custom"]
       },
 
       "scopeIncludeRx": {
@@ -211,6 +219,18 @@
         alias: ["warcinfo"],
         describe: "Optional fields added to the warcinfo record in combined WARCs",
         type: "object"
       },
+
+      "redisStoreUrl": {
+        describe: "If set, url for remote redis server to store state. Otherwise, using in-memory store",
+        type: "string"
+      },
+
+      "saveState": {
+        describe: "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
+        type: "string",
+        default: "partial",
+        choices: ["never", "partial", "always"]
+      }
     };
   }
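The commit message says the redis-backed state (enabled via --redisStoreUrl above) uses Lua scripts for atomic queue -> pending moves, with the pending entry expiring after the page timeout. The snippet below is a rough sketch of that idea only: the ioredis client, key names, and script shape are assumptions, not the actual code in the puppeteer-cluster fork.

// Rough sketch: atomically pop the next URL from the frontier queue and record it
// as pending with a TTL, so a crawl that dies mid-page eventually releases the URL.
// Key names and the ioredis client are assumptions, not the crawler's real schema.
const Redis = require("ioredis");
const redis = new Redis("redis://localhost:6379/0"); // in the crawler this would come from --redisStoreUrl

const POP_TO_PENDING = `
local url = redis.call('rpop', KEYS[1])
if url then
  redis.call('setex', KEYS[2] .. ':' .. url, ARGV[1], '1')
end
return url
`;

async function nextUrl(pageTimeoutSecs) {
  // EVAL runs the whole Lua script atomically on the Redis server,
  // so no other worker can see the URL between the pop and the pending mark.
  return await redis.eval(POP_TO_PENDING, 2, "crawl:queue", "crawl:pending", pageTimeoutSecs);
}

A companion script would perform the matching pending -> done move when a page finishes, mirroring the atomic transitions described in the commit message.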
@@ -218,17 +238,26 @@
   parseArgs(argv) {
     argv = argv || process.argv;
 
-    return yargs(hideBin(argv))
+    if (process.env.CRAWL_ARGS) {
+      argv = argv.concat(process.env.CRAWL_ARGS.split(" "));
+    }
+
+    let origConfig = {};
+
+    const parsed = yargs(hideBin(argv))
       .usage("crawler [options]")
       .option(this.cliOpts)
       .config("config", "Path to YAML config file", (configPath) => {
         if (configPath === "/crawls/stdin") {
           configPath = process.stdin.fd;
         }
-        return yaml.load(fs.readFileSync(configPath, "utf8"));
+        origConfig = yaml.load(fs.readFileSync(configPath, "utf8"));
+        return origConfig;
       })
       .check((argv) => this.validateArgs(argv))
       .argv;
+
+    return {parsed, origConfig};
   }
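To make the config path above concrete: the .config() callback accepts a YAML file (or /crawls/stdin for stdin) and, as of this change, keeps the parsed object around as origConfig alongside the yargs result. Here is a small illustrative load using only option names that appear in this diff; the layout is an assumption, not an official example from the project's docs.

// Illustrative only: parse a config string the same way the .config() callback
// above parses a file or /crawls/stdin. Keys mirror option names from this diff.
const yaml = require("js-yaml");

const exampleConfig = `
crawlId: my-crawl
scopeType: prefix
saveState: always
redisStoreUrl: redis://redis:6379/0
`;

const origConfig = yaml.load(exampleConfig);
console.log(origConfig.saveState); // "always"

Keeping origConfig around (rather than only the merged yargs output) is what lets the crawler re-serialize the original configuration together with the saved crawl state.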
|
Loading…
Add table
Add a link
Reference in a new issue