More flexible multi-value arg parsing + README update for 0.12.0 (#422)

Updated arg parsing to support multi-value arguments specified either as a
single comma-separated string or as multiple strings, using the yargs array
type + a coerce function, following the example in
https://github.com/yargs/yargs/issues/846#issuecomment-517264899

This also allows the `choices` option to be used to validate the values,
when needed.

With this setup, `--text to-pages,to-warc,final-to-warc`, `--text
to-pages,to-warc --text final-to-warc` and `--text to-pages --text
to-warc --text final-to-warc` all result in the same configuration!
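
In short, each multi-value option is declared with yargs' `array` type, a `choices` list, and a shared `coerce` function that splits comma-separated values. A minimal sketch of the pattern (assuming yargs v17; the real option definitions are in the argParser diff below):

```js
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

// split any comma-separated values into a flat array, dropping empty entries
const coerce = (array) => array.flatMap((v) => v.split(",")).filter((x) => !!x);

const argv = yargs(hideBin(process.argv))
  .option("text", {
    describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
    type: "array",
    choices: ["to-pages", "to-warc", "final-to-warc"],
    coerce,
  })
  .parseSync();

// --text to-pages,to-warc,final-to-warc
// --text to-pages,to-warc --text final-to-warc
// --text to-pages --text to-warc --text final-to-warc
// all yield argv.text = [ 'to-pages', 'to-warc', 'final-to-warc' ]
console.log(argv.text);
```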

Updated the other multi-value options (waitUntil, logging, logLevel, context, behaviors, screenshot) to use the same system.

Also updated the README with the new text extraction options and bumped the
version to 0.12.0.

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2023-11-02 11:47:37 -07:00 committed by GitHub
parent 2aeda56d40
commit 15661eb9c8
4 changed files with 84 additions and 92 deletions


@@ -1,6 +1,8 @@
name: Node.js CI
on: [push]
on:
push:
pull_request:
jobs:
lint:


@@ -35,7 +35,7 @@ the following commands. Replace `[URL]` with the web site you'd like to crawl.
Here's how you can use some of the command-line options to configure the crawl:
- To include automated text extraction for full text search, add the `--text` flag.
- To include automated text extraction for full text search to pages.jsonl, add the `--text` flag. To write extracted text to WARCs instead of or in addition to pages.jsonl, see Text Extraction below.
- To limit the crawl to a maximum number of pages, add `--pageLimit P` where P is the number of pages that will be crawled.
@@ -68,15 +68,13 @@ Options:
llel [number] [default: 1]
--crawlId, --id A user provided ID for this crawl or
crawl configuration (can also be se
t via CRAWL_ID env var)
[string] [default: "7760c6c5f6ca"]
--newContext Deprecated as of 0.8.0, any values p
assed will be ignored
[string] [default: null]
t via CRAWL_ID env var, defaults to
hostname) [string]
--waitUntil Puppeteer page.goto() condition to w
ait for before continuing, can be mu
ltiple separated by ','
[default: "load,networkidle2"]
[array] [choices: "load", "domcontentloaded", "networkidle0", "networkidle2"]
[default: ["load","networkidle2"]]
--depth The depth of the crawl for all seeds
[number] [default: -1]
--extraHops Number of extra 'hops' to follow, be
@@ -137,15 +135,16 @@ Options:
--logging Logging options for crawler, can inc
lude: stats (enabled by default), js
errors, pywb, debug
[string] [default: "stats"]
[array] [default: ["stats"]]
--logLevel Comma-separated list of log levels t
o include in logs
[string] [default: ""]
[array] [default: []]
--context Comma-separated list of contexts to
include in logs
[string] [default: ""]
--text If set, extract text to the pages.js
onl file [boolean] [default: false]
include in logs [array] [default: []]
--text Extract initial (default) or final t
ext to pages.jsonl or WARC resource
record(s)
[array] [choices: "to-pages", "to-warc", "final-to-warc"]
--cwd Crawl working directory for captures
(pywb root). If not set, defaults t
o process.cwd()
@@ -171,7 +170,7 @@ Options:
o crawl working directory)
--behaviors Which background behaviors to enable
on each page
[string] [default: "autoplay,autofetch,autoscroll,siteSpecific"]
[array] [default: ["autoplay","autofetch","autoscroll","siteSpecific"]]
--behaviorTimeout If >0, timeout (in seconds) for in-p
age behavior will run on each page.
If 0, a behavior can run until finis
@@ -188,7 +187,7 @@ Options:
--screenshot Screenshot options for crawler, can
include: view, thumbnail, fullPage (
comma-separated list)
[string] [default: ""]
[array] [choices: "view", "thumbnail", "fullPage"] [default: []]
--screencastPort If set to a non-zero value, starts a
n HTTP server with screencast access
ible on this port
@@ -236,6 +235,7 @@ Options:
--restartsOnError if set, assume will be restarted if
interrupted, don't run post-crawl pr
ocesses on interrupt
[boolean] [default: false]
--netIdleWait if set, wait for network idle after
page load and after behaviors are do
ne (in seconds). if -1 (default), de
@@ -259,8 +259,15 @@ Options:
--failOnFailedSeed If set, crawler will fail with exit
code 1 if any seed fails
[boolean] [default: false]
--failOnFailedLimit If set, save state and exit if numbe
r of failed pages exceeds this value
[number] [default: 0]
--customBehaviors injects a custom behavior file or se
t of behavior files in a directory
--debugAccessRedis if set, runs internal redis without
protected mode to allow external acc
ess (for debugging) [boolean]
--config Path to YAML config file
```
</details>
@@ -514,14 +521,28 @@ With version 0.8.0, Browsertrix Crawler includes the ability to take screenshots
Three screenshot options are available:
- `--view`: Takes a png screenshot of the initially visible viewport (1920x1080)
- `--fullPage`: Takes a png screenshot of the full page
- `--thumbnail`: Takes a jpeg thumbnail of the initially visible viewport (1920x1080)
- `--screenshot view`: Takes a png screenshot of the initially visible viewport (1920x1080)
- `--screenshot fullPage`: Takes a png screenshot of the full page
- `--screenshot thumbnail`: Takes a jpeg thumbnail of the initially visible viewport (1920x1080)
These can be combined using a comma-separated list passed to the `--screenshot` option, e.g.: `--screenshot thumbnail,view,fullPage`.
These can be combined using a comma-separated list passed to the `--screenshot` option, e.g. `--screenshot thumbnail,view,fullPage`, or passed in
separately, e.g. `--screenshot thumbnail --screenshot view --screenshot fullPage`.
Screenshots are written into a `screenshots.warc.gz` WARC file in the `archives/` directory. If the `--generateWACZ` command line option is used, the screenshots WARC is written into the `archive` directory of the WACZ file and indexed alongside the other WARCs.
### Text Extraction
Browsertrix Crawler supports text extraction via the `--text` flag.
With version 0.12.0, the `--text` flag accepts one or more of the following extraction options:
- `--text to-pages` - Extract initial text and add it to the text field in pages.jsonl
- `--text to-warc` - Extract initial page text and add it to a `urn:text:<url>` WARC resource record
- `--text final-to-warc` - Extract the final page text after all behaviors have run and add it to a `urn:textFinal:<url>` WARC resource record
The options can be passed separately or combined into a comma-separated list, e.g. `--text to-warc,final-to-warc` and `--text to-warc --text final-to-warc`
are equivalent. For backwards compatibility, `--text` alone is equivalent to `--text to-pages`.
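For example, a crawl that writes both initial and final page text into WARC records might be run as follows (a hypothetical invocation modeled on the Docker commands shown earlier in this README; the URL and collection name are placeholders):
```
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text to-warc,final-to-warc --collection example-text
```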
### Watching the crawl -- Screencasting
With version 0.4.0, Browsertrix Crawler includes an experimental 'screencasting' option, which allows watching the crawl in real-time via screencast (connected via a websocket).


@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.12.0-beta.2",
"version": "0.12.0",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",


@@ -17,6 +17,10 @@ import { logger } from "./logger.js";
// ============================================================================
class ArgParser {
get cliOpts() {
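// split any comma-separated values into a flat array and drop empty entries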
const coerce = array => {
return array.flatMap(v => v.split(",")).filter(x => !!x);
};
return {
"seeds": {
alias: "url",
@@ -40,14 +44,16 @@ class ArgParser {
"crawlId": {
alias: "id",
describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var)",
describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
type: "string",
default: process.env.CRAWL_ID || os.hostname(),
},
"waitUntil": {
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
default: "load,networkidle2",
type: "array",
default: ["load", "networkidle2"],
choices: WAIT_UNTIL_OPTS,
coerce,
},
"depth": {
@@ -173,25 +179,36 @@ class ArgParser {
"logging": {
describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug",
type: "string",
default: "stats",
type: "array",
default: ["stats"],
coerce,
},
"logLevel": {
describe: "Comma-separated list of log levels to include in logs",
type: "string",
default: "",
type: "array",
default: [],
coerce,
},
"context": {
describe: "Comma-separated list of contexts to include in logs",
type: "string",
default: "",
type: "array",
default: [],
coerce,
},
"text": {
describe: "If set, extract text to the pages.jsonl file",
type: "string",
describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
type: "array",
choices: EXTRACT_TEXT_TYPES,
coerce: (array) => {
// backwards compatibility: default --text true / --text -> --text to-pages
if (!array.length || (array.length === 1 && array[0] === "true")) {
return ["to-pages"];
}
return coerce(array);
}
},
"cwd": {
@@ -231,8 +248,10 @@ class ArgParser {
"behaviors": {
describe: "Which background behaviors to enable on each page",
default: "autoplay,autofetch,autoscroll,siteSpecific",
type: "string",
type: "array",
default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
choices: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
coerce,
},
"behaviorTimeout": {
@@ -261,9 +280,11 @@ class ArgParser {
},
"screenshot": {
describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage (comma-separated list)",
type: "string",
default: "",
describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage",
type: "array",
default: [],
choices: Array.from(Object.keys(screenshotTypes)),
coerce,
},
"screencastPort": {
@@ -441,6 +462,7 @@ class ArgParser {
}
validateArgs(argv) {
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
// Check that the collection name is valid.
@@ -448,42 +470,13 @@ class ArgParser {
logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
}
// waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
// can be multiple separate by comma
// (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
if (typeof argv.waitUntil != "object") {
argv.waitUntil = argv.waitUntil.split(",");
}
// split text options
if (argv.text === "" || argv.text === "true") {
argv.text = "to-pages";
}
argv.waitUntil = validateArrayOpts(argv.waitUntil, "waitUntil", WAIT_UNTIL_OPTS);
argv.screenshot = validateArrayOpts(argv.screenshot, "screenshot", Array.from(Object.keys(screenshotTypes)));
argv.text = validateArrayOpts(argv.text, "text", EXTRACT_TEXT_TYPES);
// log options
argv.logging = argv.logging.split(",");
argv.logLevel = argv.logLevel ? argv.logLevel.split(",") : [];
argv.context = argv.context ? argv.context.split(",") : [];
// background behaviors to apply
const behaviorOpts = {};
if (typeof argv.behaviors != "object"){
argv.behaviors = argv.behaviors.split(",");
}
argv.behaviors.forEach((x) => behaviorOpts[x] = true);
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
argv.behaviorOpts = JSON.stringify(behaviorOpts);
if (argv.newContext) {
logger.info("Note: The newContext argument is deprecated in 0.8.0. Values passed to this option will be ignored");
}
argv.text = argv.text || [];
if (argv.mobileDevice) {
argv.emulateDevice = devices[argv.mobileDevice.replace("-", " ")];
@@ -560,30 +553,6 @@ class ArgParser {
}
}
function validateArrayOpts(value, name, allowedValues) {
if (!value) {
return [];
}
if (value instanceof Array) {
return value;
}
if (typeof(value) !== "string") {
return [];
}
const arrayValue = value.split(",");
for (value of arrayValue) {
if (!allowedValues.includes(value)) {
logger.fatal(`Invalid value "${value}" for field "${name}": allowed values are: ${allowedValues.join(",")}`);
}
}
return arrayValue;
}
export function parseArgs(argv) {
return new ArgParser().parseArgs(argv);
}