State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)

* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved: defaults to 'partial' (save only when interrupted); 'always' and 'never' are also supported (see the state file sketch after this list)
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStoreUrl used to enable redis-based state; in-memory state is used otherwise
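
The serialized state is plain YAML; a minimal sketch of what a saved crawl state might contain is shown below. The exact field names are an assumption for illustration, not the crawler's actual schema:

```yaml
# crawls/crawl-<ts>-<id>.yaml -- illustrative only, field names are assumptions
state:
  done:
    - url: https://example.com/
    - url: https://example.com/missing-page
      failed: true        # failed urls are recorded under done and can be retried on restart
  pending:
    - https://example.com/page-1
  queued:
    - https://example.com/page-2
    - https://example.com/page-3
```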



* signals/crawl interruption:
- crawl state set to 'drain' mode, so no further urls are provided to crawl
- graceful stop of crawl in response to sigint/sigterm
- first sigint/sigterm waits for current pages to finish gracefully; a second one terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- disable puppeteer's own handleSIGTERM, handleSIGHUP, handleSIGINT handlers so the crawler controls shutdown
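
A minimal sketch of this two-stage shutdown, assuming hypothetical crawlState helpers (setDrain, serialize) that stand in for the crawler's real state object:

```js
const puppeteer = require("puppeteer-core");

// Hypothetical stand-in for the crawler's state object, not the actual API.
const crawlState = {
  setDrain() { /* stop handing out new urls from the frontier */ },
  async serialize() { /* write state to crawls/crawl-<ts>-<id>.yaml */ },
};

let interrupted = false;

async function handleSignal(signal) {
  if (!interrupted) {
    // first signal: drain, let in-progress pages finish, then save state
    interrupted = true;
    console.log(`${signal} received, waiting for current pages, then saving state`);
    crawlState.setDrain();
    await crawlState.serialize();
  } else {
    // second signal: terminate immediately
    console.log("second signal received, exiting now");
    process.exit(1);
  }
}

process.on("SIGINT", () => handleSignal("SIGINT"));
process.on("SIGTERM", () => handleSignal("SIGTERM"));

async function launchBrowser() {
  // puppeteer's built-in signal handlers are disabled so the crawler owns shutdown
  return await puppeteer.launch({
    handleSIGINT: false,
    handleSIGTERM: false,
    handleSIGHUP: false,
    executablePath: process.env.BROWSER_BIN, // path assumption for puppeteer-core
  });
}
```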

* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns numPending() + numSeen() to work with cluster stats
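
A sketch of the atomic queue → pending move with a Lua script, assuming ioredis and a simplified layout where the queue is a Redis list and each pending url gets its own key expiring after the page timeout (key names and layout are assumptions, not the crawler's actual schema):

```js
const Redis = require("ioredis");
const redis = new Redis(process.env.REDIS_URL || "redis://localhost:6379/0");

// Register the Lua script as a custom command so pop + mark-pending runs atomically.
redis.defineCommand("movePending", {
  numberOfKeys: 2,
  lua: `
local url = redis.call('lpop', KEYS[1])
if url then
  -- pending entry expires after the page timeout, so stalled urls can be retried
  redis.call('setex', KEYS[2] .. ':' .. url, ARGV[1], '1')
end
return url
`,
});

async function nextUrl(crawlId, pageTimeoutSecs) {
  // key names (<id>:q, <id>:p) are illustrative only
  return await redis.movePending(`${crawlId}:q`, `${crawlId}:p`, pageTimeoutSecs);
}
```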

* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible

* build update:
- switch base browser image to the new webrecorder/browsertrix-browser-base, a simple image containing only the browser .deb files, for amd64 and arm64 builds
- use setuptools<58.0

* misc crawl/scoping rule and state fixes:
- scoping: fix rules when external is used with scopeType
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes

* py-wacz update:
- add 'seed': true to page entries that are seeds, enabling optimized wacz creation while keeping non-seed pages separate (supported via wacz 0.3.2); see the example after this list
- pywb: use latest pywb branch for improved twitter video capture
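
For illustration, a pages.jsonl entry for a seed page might look like the line below; only the 'seed' flag is specific to this change, the other fields are standard pages.jsonl fields and the values are made up:

```json
{"id": "a1b2c3d4", "url": "https://example.com/", "ts": "2021-09-28T16:41:16Z", "title": "Example Domain", "seed": true}
```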

* update to latest browsertrix-behaviors

* fix setuptools dependency #88

* update README for 0.5.0 beta
Ilya Kreymer 2021-09-28 09:41:16 -07:00 committed by GitHub
parent 2956be2026
commit 39ddecd35e
13 changed files with 544 additions and 6407 deletions


@@ -1,5 +1,6 @@
const path = require("path");
const fs = require("fs");
const os = require("os");
const yaml = require("js-yaml");
const puppeteer = require("puppeteer-core");
@@ -12,7 +13,6 @@ const { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } = require("./constants");
const { ScopedSeed } = require("./seeds");
// ============================================================================
class ArgParser {
get cliOpts() {
@@ -37,6 +37,13 @@ class ArgParser {
type: "number",
},
"crawlId": {
alias: "id",
describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var)",
type: "string",
default: process.env.CRAWL_ID || os.hostname(),
},
"newContext": {
describe: "The context for each new capture, can be a new: page, window, session or browser.",
default: "page",
@@ -67,8 +74,9 @@
},
"scopeType": {
describe: "Predefined for which URLs to crawl, can be: prefix, page, host, any, or custom, to use the scopeIncludeRx/scopeExcludeRx",
describe: "A predfined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
type: "string",
choices: ["page", "page-spa", "prefix", "host", "any", "custom"]
},
"scopeIncludeRx": {
@@ -211,6 +219,18 @@ class ArgParser {
alias: ["warcinfo"],
describe: "Optional fields added to the warcinfo record in combined WARCs",
type: "object"
},
"redisStoreUrl": {
describe: "If set, url for remote redis server to store state. Otherwise, using in-memory store",
type: "string"
},
"saveState": {
describe: "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
type: "string",
default: "partial",
choices: ["never", "partial", "always"]
}
};
}
@@ -218,17 +238,26 @@ class ArgParser {
parseArgs(argv) {
argv = argv || process.argv;
return yargs(hideBin(argv))
if (process.env.CRAWL_ARGS) {
argv = argv.concat(process.env.CRAWL_ARGS.split(" "));
}
let origConfig = {};
const parsed = yargs(hideBin(argv))
.usage("crawler [options]")
.option(this.cliOpts)
.config("config", "Path to YAML config file", (configPath) => {
if (configPath === "/crawls/stdin") {
configPath = process.stdin.fd;
}
return yaml.load(fs.readFileSync(configPath, "utf8"));
origConfig = yaml.load(fs.readFileSync(configPath, "utf8"));
return origConfig;
})
.check((argv) => this.validateArgs(argv))
.argv;
return {parsed, origConfig};
}