State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)

* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved: defaults to 'partial' (save only when interrupted); 'always' and 'never' are also supported (see the state file sketch after this list)
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStoreUrl used to enable redis-based state; in-memory state is used otherwise
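
The serialized state is plain YAML; a minimal sketch of what a saved crawl state might contain is shown below. The exact field names are an assumption for illustration, not the crawler's actual schema:

```yaml
# crawls/crawl-<ts>-<id>.yaml -- illustrative only, field names are assumptions
state:
  done:
    - url: https://example.com/
    - url: https://example.com/missing-page
      failed: true        # failed urls are recorded under done and can be retried on restart
  pending:
    - https://example.com/page-1
  queued:
    - https://example.com/page-2
    - https://example.com/page-3
```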



* signals/crawl interruption:
- crawl state set to 'drain' mode, so no further urls are provided to crawl
- graceful stop of crawl in response to sigint/sigterm
- first sigint/sigterm waits for current pages to finish gracefully; a second one terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- disable puppeteer's own handleSIGTERM, handleSIGHUP, handleSIGINT handlers so the crawler controls shutdown
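
A minimal sketch of this two-stage shutdown, assuming hypothetical crawlState helpers (setDrain, serialize) that stand in for the crawler's real state object:

```js
const puppeteer = require("puppeteer-core");

// Hypothetical stand-in for the crawler's state object, not the actual API.
const crawlState = {
  setDrain() { /* stop handing out new urls from the frontier */ },
  async serialize() { /* write state to crawls/crawl-<ts>-<id>.yaml */ },
};

let interrupted = false;

async function handleSignal(signal) {
  if (!interrupted) {
    // first signal: drain, let in-progress pages finish, then save state
    interrupted = true;
    console.log(`${signal} received, waiting for current pages, then saving state`);
    crawlState.setDrain();
    await crawlState.serialize();
  } else {
    // second signal: terminate immediately
    console.log("second signal received, exiting now");
    process.exit(1);
  }
}

process.on("SIGINT", () => handleSignal("SIGINT"));
process.on("SIGTERM", () => handleSignal("SIGTERM"));

async function launchBrowser() {
  // puppeteer's built-in signal handlers are disabled so the crawler owns shutdown
  return await puppeteer.launch({
    handleSIGINT: false,
    handleSIGTERM: false,
    handleSIGHUP: false,
    executablePath: process.env.BROWSER_BIN, // path assumption for puppeteer-core
  });
}
```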

* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns numPending() + numSeen() to work with cluster stats
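
A sketch of the atomic queue → pending move with a Lua script, assuming ioredis and a simplified layout where the queue is a Redis list and each pending url gets its own key expiring after the page timeout (key names and layout are assumptions, not the crawler's actual schema):

```js
const Redis = require("ioredis");
const redis = new Redis(process.env.REDIS_URL || "redis://localhost:6379/0");

// Register the Lua script as a custom command so pop + mark-pending runs atomically.
redis.defineCommand("movePending", {
  numberOfKeys: 2,
  lua: `
local url = redis.call('lpop', KEYS[1])
if url then
  -- pending entry expires after the page timeout, so stalled urls can be retried
  redis.call('setex', KEYS[2] .. ':' .. url, ARGV[1], '1')
end
return url
`,
});

async function nextUrl(crawlId, pageTimeoutSecs) {
  // key names (<id>:q, <id>:p) are illustrative only
  return await redis.movePending(`${crawlId}:q`, `${crawlId}:p`, pageTimeoutSecs);
}
```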

* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible

* build update:
- switch base browser image to the new webrecorder/browsertrix-browser-base, a simple image containing only the browser .deb files, for amd64 and arm64 builds
- use setuptools<58.0

* misc crawl/scoping rule and state fixes:
- scoping: fix rules when external is used with scopeType
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes

* py-wacz update:
- add 'seed': true to page entries that are seeds, enabling optimized wacz creation while keeping non-seed pages separate (supported via wacz 0.3.2); see the example after this list
- pywb: use latest pywb branch for improved twitter video capture
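
For illustration, a pages.jsonl entry for a seed page might look like the line below; only the 'seed' flag is specific to this change, the other fields are standard pages.jsonl fields and the values are made up:

```json
{"id": "a1b2c3d4", "url": "https://example.com/", "ts": "2021-09-28T16:41:16Z", "title": "Example Domain", "seed": true}
```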

* update to latest browsertrix-behaviors

* fix setuptools dependency #88

* update README for 0.5.0 beta
Ilya Kreymer 2021-09-28 09:41:16 -07:00 committed by GitHub
parent 2956be2026
commit 39ddecd35e
13 changed files with 544 additions and 6407 deletions


@@ -1,5 +1,6 @@
const path = require("path");
const fs = require("fs");
const os = require("os");
const yaml = require("js-yaml");
const puppeteer = require("puppeteer-core");
@@ -12,7 +13,6 @@ const { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } = require("./constants");
const { ScopedSeed } = require("./seeds");
// ============================================================================
class ArgParser {
get cliOpts() {
@@ -37,6 +37,13 @@ class ArgParser {
type: "number",
},
"crawlId": {
alias: "id",
describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var)",
type: "string",
default: process.env.CRAWL_ID || os.hostname(),
},
"newContext": {
describe: "The context for each new capture, can be a new: page, window, session or browser.",
default: "page",
@@ -67,8 +74,9 @@
},
"scopeType": {
describe: "Predefined for which URLs to crawl, can be: prefix, page, host, any, or custom, to use the scopeIncludeRx/scopeExcludeRx",
describe: "A predfined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
type: "string",
choices: ["page", "page-spa", "prefix", "host", "any", "custom"]
},
"scopeIncludeRx": {
@@ -211,6 +219,18 @@ class ArgParser {
alias: ["warcinfo"],
describe: "Optional fields added to the warcinfo record in combined WARCs",
type: "object"
},
"redisStoreUrl": {
describe: "If set, url for remote redis server to store state. Otherwise, using in-memory store",
type: "string"
},
"saveState": {
describe: "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
type: "string",
default: "partial",
choices: ["never", "partial", "always"]
}
};
}
@@ -218,17 +238,26 @@ class ArgParser {
parseArgs(argv) {
argv = argv || process.argv;
return yargs(hideBin(argv))
if (process.env.CRAWL_ARGS) {
argv = argv.concat(process.env.CRAWL_ARGS.split(" "));
}
let origConfig = {};
const parsed = yargs(hideBin(argv))
.usage("crawler [options]")
.option(this.cliOpts)
.config("config", "Path to YAML config file", (configPath) => {
if (configPath === "/crawls/stdin") {
configPath = process.stdin.fd;
}
return yaml.load(fs.readFileSync(configPath, "utf8"));
origConfig = yaml.load(fs.readFileSync(configPath, "utf8"));
return origConfig;
})
.check((argv) => this.validateArgs(argv))
.argv;
return {parsed, origConfig};
}