crawler args typing (#680)

- Refactors args parsing so that `Crawler.params` is properly typed with the
CLI options plus additions via the `CrawlerArgs` type (pattern sketched below)
- also adds typing to create-login-profile CLI options
- validation still done w/o typing due to yargs limitations
- tests: exclude a slow page from test crawls for faster test runs
Ilya Kreymer 2024-09-05 18:10:27 -07:00 committed by GitHub
parent 802a416c7e
commit 9c9643c24f
14 changed files with 686 additions and 642 deletions
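
The typing pattern applied here can be summarized with a minimal standalone sketch (simplified option set and module layout, not the crawler's actual code; the option names and `crawlId` are borrowed from the real CLI for illustration): declaring options inline in yargs' `.options()` lets `.parseSync()` return an inferred object type, and `CrawlerArgs` extends that inferred type with fields computed during validation, replacing the old `params: any`.

import yargs from "yargs";
import { hideBin } from "yargs/helpers";

// Options declared inline so TypeScript can infer the parsed shape.
function parseArgs(argv: string[]) {
  return yargs(hideBin(argv))
    .usage("crawler [options]")
    .options({
      collection: { type: "string", default: "crawl" },
      workers: { type: "number", default: 1 },
      blockMessage: { type: "string", default: "" },
    })
    .parseSync();
}

// Derive the params type from the parser itself, then extend it with
// fields added later during validation (mirrors CrawlerArgs in argParser.ts).
type CrawlerArgs = ReturnType<typeof parseArgs> & {
  crawlId: string;
};

const params: CrawlerArgs = {
  ...parseArgs(process.argv),
  crawlId: process.env.CRAWL_ID || "example-crawl",
};

// collection is a string, workers a number: no `any` casts needed at call sites.
console.log(params.collection, params.workers, params.blockMessage, params.crawlId);

Validation itself still mutates the parsed argv without type checking (hence `validateArgs(argv: any, ...)` and the handful of `as` casts in the diff), which is the yargs limitation noted in the commit message.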

View file

@ -43,9 +43,9 @@ Options:
tom"]
--scopeIncludeRx, --include Regex of page URLs that should be in
cluded in the crawl (defaults to the
immediate directory of URL)
immediate directory of URL)[string]
--scopeExcludeRx, --exclude Regex of page URLs that should be ex
cluded from the crawl.
cluded from the crawl. [string]
--allowHashUrls Allow Hashtag URLs, useful for singl
e-page-application crawling or when
different hashtags load dynamic cont
@ -56,14 +56,14 @@ Options:
an iframe [array] [default: []]
--blockMessage If specified, when a URL is blocked,
a record with this error message is
added instead [string]
added instead[string] [default: ""]
--blockAds, --blockads If set, block advertisements from be
ing loaded (based on Stephen Black's
blocklist)
[boolean] [default: false]
--adBlockMessage If specified, when an ad is blocked,
a record with this error message is
added instead [string]
added instead[string] [default: ""]
-c, --collection Collection name to crawl to (replay
will be accessible under this name i
n pywb preview)
@ -79,7 +79,7 @@ Options:
ineWarc [boolean] [default: false]
--rolloverSize If set, declare the rollover size
[number] [default: 1000000000]
--generateWACZ, --generatewacz, --ge If set, generate wacz
--generateWACZ, --generatewacz, --ge If set, generate WACZ on disk
nerateWacz [boolean] [default: false]
--logging Logging options for crawler, can inc
lude: stats (enabled by default), js
@ -94,15 +94,15 @@ Options:
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
inks", "sitemap", "replay", "proxy"] [default: []]
inks", "sitemap", "wacz", "replay", "proxy"] [default: []]
--logExcludeContext Comma-separated list of contexts to
NOT include in logs
[array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
inks", "sitemap", "replay", "proxy"] [default: ["recorderNetwork","jsError","s
creencast"]]
inks", "sitemap", "wacz", "replay", "proxy"] [default: ["recorderNetwork","jsE
rror","screencast"]]
--text Extract initial (default) or final t
ext to pages.jsonl or WARC resource
record(s)
@ -127,15 +127,15 @@ Options:
those greater than or equal to (>=)
provided ISO Date string (YYYY-MM-D
D or YYYY-MM-DDTHH:MM:SS or partial
date)
date) [string]
--sitemapToDate, --sitemapTo If set, filter URLs from sitemaps to
those less than or equal to (<=) pr
ovided ISO Date string (YYYY-MM-DD o
r YYYY-MM-DDTHH:MM:SS or partial dat
e)
e) [string]
--statsFilename If set, output stats as JSON to this
file. (Relative filename resolves t
o crawl working directory)
o crawl working directory) [string]
--behaviors Which background behaviors to enable
on each page
[array] [choices: "autoplay", "autofetch", "autoscroll", "siteSpecific"] [defa
@ -304,7 +304,7 @@ Options:
--shutdownWait Shutdown browser in interactive after this many seco
nds, if no pings received [number] [default: 0]
--profile Path or HTTP(S) URL to tar.gz file which contains th
e browser profile directory [string]
e browser profile directory [string] [default: ""]
--windowSize Browser window dimensions, specified as: width,heigh
t [string] [default: "1360,1020"]
--cookieDays If >0, set all cookies, including session cookies, t

View file

@ -12,7 +12,7 @@ import {
WorkerId,
} from "./util/state.js";
import { parseArgs } from "./util/argParser.js";
import { CrawlerArgs, parseArgs } from "./util/argParser.js";
import yaml from "js-yaml";
@ -52,7 +52,7 @@ import {
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
} from "./util/constants.js";
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
import { OriginOverride } from "./util/originoverride.js";
import {
@ -107,8 +107,7 @@ type PageEntry = {
// ============================================================================
export class Crawler {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
params: any;
params: CrawlerArgs;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
origConfig: any;
@ -200,8 +199,8 @@ export class Crawler {
constructor() {
const args = this.parseArgs();
this.params = args.parsed;
this.origConfig = args.origConfig;
this.params = args as CrawlerArgs;
this.origConfig = this.params.origConfig;
// root collections dir
this.collDir = path.join(
@ -872,7 +871,7 @@ self.__bx_behaviors.selectMainBehavior();
const result = await timedRun(
directFetchCapture({ url, headers, cdp }),
this.params.timeout,
this.params.pageLoadTimeout,
"Direct fetch of page URL timed out",
logDetails,
"fetch",
@ -1396,7 +1395,7 @@ self.__bx_behaviors.selectMainBehavior();
if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(
this.params.blockRules,
this.params.blockRules as BlockRuleDecl[],
this.captureBasePrefix,
this.params.blockMessage,
);
@ -1405,7 +1404,9 @@ self.__bx_behaviors.selectMainBehavior();
this.screencaster = this.initScreenCaster();
if (this.params.originOverride && this.params.originOverride.length) {
this.originOverride = new OriginOverride(this.params.originOverride);
this.originOverride = new OriginOverride(
this.params.originOverride as string[],
);
}
await this._addInitialSeeds();
@ -2183,7 +2184,7 @@ self.__bx_behaviors.selectMainBehavior();
id: "pages",
title,
};
header.hasText = this.params.text.includes("to-pages");
header.hasText = this.params.text.includes("to-pages") + "";
if (this.params.text.length) {
logger.debug("Text Extraction: " + this.params.text.join(","));
} else {
@ -2290,8 +2291,12 @@ self.__bx_behaviors.selectMainBehavior();
return;
}
const fromDate = this.params.sitemapFromDate;
const toDate = this.params.sitemapToDate;
const fromDate = this.params.sitemapFromDate
? new Date(this.params.sitemapFromDate)
: undefined;
const toDate = this.params.sitemapToDate
? new Date(this.params.sitemapToDate)
: undefined;
const headers = this.headers;
logger.info(

View file

@ -7,7 +7,7 @@ import http, { IncomingMessage, ServerResponse } from "http";
import readline from "readline";
import child_process from "child_process";
import yargs, { Options } from "yargs";
import yargs from "yargs";
import { logger } from "./util/logger.js";
@ -35,8 +35,10 @@ const behaviors = fs.readFileSync(
{ encoding: "utf8" },
);
function cliOpts(): { [key: string]: Options } {
return {
function initArgs() {
return yargs(process.argv)
.usage("browsertrix-crawler profile [options]")
.options({
url: {
describe: "The URL of the login page",
type: "string",
@ -46,22 +48,27 @@ function cliOpts(): { [key: string]: Options } {
user: {
describe:
"The username for the login. If not specified, will be prompted",
type: "string",
},
password: {
describe:
"The password for the login. If not specified, will be prompted (recommended)",
type: "string",
},
filename: {
describe:
"The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided",
type: "string",
default: "/crawls/profiles/profile.tar.gz",
},
debugScreenshot: {
describe:
"If specified, take a screenshot after login and save as this filename",
type: "boolean",
default: false,
},
headless: {
@ -93,18 +100,19 @@ function cliOpts(): { [key: string]: Options } {
describe:
"Path or HTTP(S) URL to tar.gz file which contains the browser profile directory",
type: "string",
default: "",
},
windowSize: {
type: "string",
describe: "Browser window dimensions, specified as: width,height",
type: "string",
default: getDefaultWindowSize(),
},
cookieDays: {
type: "number",
describe:
"If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
type: "number",
default: 7,
},
@ -115,7 +123,8 @@ function cliOpts(): { [key: string]: Options } {
},
sshProxyPrivateKeyFile: {
describe: "path to SSH private key for SOCKS5 over SSH proxy connection",
describe:
"path to SSH private key for SOCKS5 over SSH proxy connection",
type: "string",
},
@ -124,7 +133,8 @@ function cliOpts(): { [key: string]: Options } {
"path to SSH known hosts file for SOCKS5 over SSH proxy connection",
type: "string",
},
};
})
.parseSync();
}
function getDefaultWindowSize() {
@ -140,10 +150,7 @@ function handleTerminate(signame: string) {
}
async function main() {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const params: any = yargs(process.argv)
.usage("browsertrix-crawler profile [options]")
.option(cliOpts()).argv;
const params = initArgs();
logger.setDebugLogging(true);

View file

@ -110,8 +110,8 @@ export class ReplayCrawler extends Crawler {
this.infoWriter = null;
this.includeRx = parseRx(this.params.include);
this.excludeRx = parseRx(this.params.include);
this.includeRx = parseRx(this.params.scopeIncludeRx);
this.excludeRx = parseRx(this.params.scopeExcludeRx);
}
async bootstrap(): Promise<void> {

View file

@ -4,7 +4,7 @@ import os from "os";
import yaml from "js-yaml";
import { KnownDevices as devices } from "puppeteer-core";
import yargs, { Options } from "yargs";
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import {
@ -19,17 +19,38 @@ import { screenshotTypes } from "./screenshots.js";
import {
DEFAULT_EXCLUDE_LOG_CONTEXTS,
LOG_CONTEXT_TYPES,
LogContext,
logger,
} from "./logger.js";
import { SaveState } from "./state.js";
// ============================================================================
export type CrawlerArgs = ReturnType<typeof parseArgs> & {
logContext: LogContext[];
logExcludeContext: LogContext[];
text: string[];
scopedSeeds: ScopedSeed[];
crawlId: string;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
origConfig: Record<string, any>;
state?: SaveState;
warcInfo?: Record<string, string>;
};
// ============================================================================
class ArgParser {
get cliOpts(): { [key: string]: Options } {
const coerce = (array: string[]) => {
initArgs(argv: string[]) {
const coerce = (array: string[]): string[] => {
return array.flatMap((v) => v.split(",")).filter((x) => !!x);
};
return {
return yargs(hideBin(argv))
.usage("crawler [options]")
.options({
seeds: {
alias: "url",
describe: "The URL to start crawling from",
@ -74,7 +95,8 @@ class ArgParser {
},
extraHops: {
describe: "Number of extra 'hops' to follow, beyond the current scope",
describe:
"Number of extra 'hops' to follow, beyond the current scope",
default: 0,
type: "number",
},
@ -119,11 +141,14 @@ class ArgParser {
alias: "include",
describe:
"Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
type: "string",
},
scopeExcludeRx: {
alias: "exclude",
describe: "Regex of page URLs that should be excluded from the crawl.",
describe:
"Regex of page URLs that should be excluded from the crawl.",
type: "string",
},
allowHashUrls: {
@ -142,6 +167,7 @@ class ArgParser {
describe:
"If specified, when a URL is blocked, a record with this error message is added instead",
type: "string",
default: "",
},
blockAds: {
@ -156,6 +182,7 @@ class ArgParser {
describe:
"If specified, when an ad is blocked, a record with this error message is added instead",
type: "string",
default: "",
},
collection: {
@ -289,15 +316,18 @@ class ArgParser {
alias: "sitemapFrom",
describe:
"If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
type: "string",
},
sitemapToDate: {
alias: "sitemapTo",
describe:
"If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
type: "string",
},
statsFilename: {
type: "string",
describe:
"If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)",
},
@ -584,7 +614,7 @@ class ArgParser {
"path to SSH known hosts file for SOCKS5 over SSH proxy connection",
type: "string",
},
};
});
}
parseArgs(argvParams?: string[], isQA = false) {
@ -601,9 +631,7 @@ class ArgParser {
let origConfig = {};
const parsed = yargs(hideBin(argv))
.usage("crawler [options]")
.option(this.cliOpts)
const parsed = this.initArgs(argv)
.config(
"config",
"Path to YAML config file",
@ -616,9 +644,12 @@ class ArgParser {
return origConfig;
},
)
.check((argv) => this.validateArgs(argv, isQA)).argv;
.check((argv) => this.validateArgs(argv, isQA))
.parseSync();
return { parsed, origConfig };
parsed.origConfig = origConfig;
return parsed;
}
splitCrawlArgsQuoteSafe(crawlArgs: string): string[] {
@ -629,8 +660,8 @@ class ArgParser {
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
validateArgs(argv: Record<string, any>, isQA: boolean) {
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname;
validateArgs(argv: any, isQA: boolean) {
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
// Check that the collection name is valid.
@ -675,7 +706,8 @@ class ArgParser {
for (const seed of urlSeedFileList) {
if (seed) {
argv.seeds.push(seed);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(argv.seeds as any).push(seed);
}
}
}
@ -689,7 +721,7 @@ class ArgParser {
//logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
}
argv.scopedSeeds = [];
const scopedSeeds: ScopedSeed[] = [];
if (!isQA) {
const scopeOpts = {
@ -701,24 +733,22 @@ class ArgParser {
extraHops: argv.extraHops,
};
for (let seed of argv.seeds) {
if (typeof seed === "string") {
seed = { url: seed };
}
for (const seed of argv.seeds) {
const newSeed = typeof seed === "string" ? { url: seed } : seed;
try {
argv.scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...seed }));
scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
logger.error("Failed to create seed", {
error: e.toString(),
...scopeOpts,
...seed,
...newSeed,
});
if (argv.failOnFailedSeed) {
logger.fatal(
"Invalid seed specified, aborting crawl",
{ url: seed.url },
{ url: newSeed.url },
"general",
1,
);
@ -726,13 +756,15 @@ class ArgParser {
}
}
if (!argv.scopedSeeds.length) {
if (!scopedSeeds.length) {
logger.fatal("No valid seeds specified, aborting crawl");
}
} else if (!argv.qaSource) {
logger.fatal("--qaSource required for QA mode");
}
argv.scopedSeeds = scopedSeeds;
// Resolve statsFilename
if (argv.statsFilename) {
argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);

View file

@ -18,7 +18,7 @@ const BlockState = {
BLOCK_AD: "advertisement",
};
type BlockRuleDecl = {
export type BlockRuleDecl = {
url?: string;
frameTextMatch?: string;
inFrameUrl?: string;

View file

@ -149,7 +149,7 @@ declare module "ioredis" {
}
// ============================================================================
type SaveState = {
export type SaveState = {
done?: number | string[];
finished: string[];
queued: string[];

View file

@ -3,7 +3,7 @@ import fs from "fs";
test("ensure dryRun crawl only writes pages and logs", async () => {
child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun',
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun --exclude community',
);
const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort();

View file

@ -3,7 +3,7 @@ import fs from "fs";
test("ensure that stats file is modified", async () => {
const child = child_process.exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --collection file-stats --statsFilename progress.json",
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --exclude community --collection file-stats --statsFilename progress.json",
);
// detect crawler exit

View file

@ -6,7 +6,7 @@ const exec = util.promisify(execCallback);
test("ensure page limit reached", async () => {
execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json',
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json --exclude community',
);
});

View file

@ -6,7 +6,7 @@ const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...
test("ensure multi url crawl run with docker run passes", async () => {
child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2',
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2 --exclude community',
);
});

View file

@ -3,7 +3,7 @@ import fs from "fs";
test("set rollover to 500K and ensure individual WARCs rollover, including screenshots", async () => {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --limit 5 --collection rollover-500K --rolloverSize 500000 --screenshot view"
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --limit 5 --exclude community --collection rollover-500K --rolloverSize 500000 --screenshot view"
);
const warcLists = fs.readdirSync("test-crawls/collections/rollover-500K/archive");

View file

@ -53,7 +53,7 @@ test("check crawl interrupted + saved state written", async () => {
try {
containerId = execSync(
"docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\"",
"docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\" --exclude community",
{ encoding: "utf-8" },
//wait.callback,
);
@ -129,7 +129,7 @@ test("check crawl restarted with saved state", async () => {
try {
containerId = execSync(
`docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors ""`,
`docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors "" --exclude community`,
{ encoding: "utf-8" },
);
} catch (error) {

View file

@ -13,7 +13,7 @@ function getSeeds(config) {
};
const res = parseArgs(["node", "crawler", "--config", "stdinconfig"]);
return res.parsed.scopedSeeds;
return res.scopedSeeds;
}
test("default scope", async () => {