browsertrix-crawler/util/argParser.js
More flexible multi value arg parsing + README update for 0.12.0 (#422)
Updated arg parsing, following the example in
https://github.com/yargs/yargs/issues/846#issuecomment-517264899,
to support multi-value arguments specified either as one comma-separated
string or as multiple strings, using the array type + a coerce function.

This also allows the `choices` option to be used to validate the values,
when needed.

With this setup, `--text to-pages,to-warc,final-to-warc`,
`--text to-pages,to-warc --text final-to-warc`, and
`--text to-pages --text to-warc --text final-to-warc` all result in the
same configuration! (A standalone sketch of the pattern follows.)
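
For illustration, here is a minimal standalone sketch of the same array
type + coerce pattern (the `--text` option is real; the surrounding setup
is simplified and is not the crawler's actual parser):

```js
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

// Flatten comma-separated values and drop empty entries so that one
// combined string and repeated flags parse to the same array.
const coerce = (array) => array.flatMap((v) => v.split(",")).filter((x) => !!x);

const argv = yargs(hideBin(process.argv))
  .option("text", {
    type: "array",
    choices: ["to-pages", "to-warc", "final-to-warc"],
    coerce,
  })
  .parse();

// All three invocations above print:
// [ 'to-pages', 'to-warc', 'final-to-warc' ]
console.log(argv.text);
```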

Updated the other multi-value args (waitUntil, logging, logLevel, context, behaviors, screenshot) to use the same system.

Also updated the README with the new text extraction options and bumped
the version to 0.12.0

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>

import path from "path";
import fs from "fs";
import os from "os";
import yaml from "js-yaml";
import { KnownDevices as devices } from "puppeteer-core";
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js";
import { logger } from "./logger.js";
// ============================================================================
class ArgParser {
get cliOpts() {
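// Split comma-separated values and drop empty entries, so multi-value
// options accept one combined string or repeated flags interchangeably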
const coerce = array => {
return array.flatMap(v => v.split(",")).filter(x => !!x);
};
return {
"seeds": {
alias: "url",
describe: "The URL to start crawling from",
type: "array",
default: [],
},
"seedFile": {
alias: ["urlFile"],
describe: "If set, read a list of seed urls, one per line, from the specified",
type: "string",
},
"workers": {
alias: "w",
describe: "The number of workers to run in parallel",
default: 1,
type: "number",
},
"crawlId": {
alias: "id",
describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
type: "string",
},
"waitUntil": {
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
type: "array",
default: ["load", "networkidle2"],
choices: WAIT_UNTIL_OPTS,
coerce,
},
"depth": {
describe: "The depth of the crawl for all seeds",
default: -1,
type: "number",
},
"extraHops": {
describe: "Number of extra 'hops' to follow, beyond the current scope",
default: 0,
type: "number"
},
"pageLimit": {
alias: "limit",
describe: "Limit crawl to this number of pages",
default: 0,
type: "number",
},
"maxPageLimit": {
describe: "Maximum pages to crawl, overriding pageLimit if both are set",
default: 0,
type: "number",
},
"pageLoadTimeout": {
alias: "timeout",
describe: "Timeout for each page to load (in seconds)",
default: 90,
type: "number",
},
"scopeType": {
describe: "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
type: "string",
choices: ["page", "page-spa", "prefix", "host", "domain", "any", "custom"]
},
"scopeIncludeRx": {
alias: "include",
describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
},
"scopeExcludeRx": {
alias: "exclude",
describe: "Regex of page URLs that should be excluded from the crawl."
},
"allowHashUrls": {
describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
},
"blockRules": {
describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
type: "array",
default: [],
},
"blockMessage": {
describe: "If specified, when a URL is blocked, a record with this error message is added instead",
type: "string",
},
"blockAds": {
alias: "blockads",
describe: "If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
type: "boolean",
default: false,
},
"adBlockMessage": {
describe: "If specified, when an ad is blocked, a record with this error message is added instead",
type: "string",
},
"collection": {
alias: "c",
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
type: "string",
default: "crawl-@ts"
},
"headless": {
describe: "Run in headless mode, otherwise start xvfb",
type: "boolean",
default: false,
},
"driver": {
describe: "JS driver for the crawler",
type: "string",
default: "./defaultDriver.js",
},
"generateCDX": {
alias: ["generatecdx", "generateCdx"],
describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
type: "boolean",
default: false,
},
"combineWARC": {
alias: ["combinewarc", "combineWarc"],
describe: "If set, combine the warcs",
type: "boolean",
default: false,
},
"rolloverSize": {
describe: "If set, declare the rollover size",
default: 1000000000,
type: "number",
},
"generateWACZ": {
alias: ["generatewacz", "generateWacz"],
describe: "If set, generate wacz",
type: "boolean",
default: false,
},
"logging": {
describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug",
type: "array",
default: ["stats"],
coerce,
},
"logLevel": {
describe: "Comma-separated list of log levels to include in logs",
type: "array",
default: [],
coerce,
},
"context": {
describe: "Comma-separated list of contexts to include in logs",
type: "array",
default: [],
coerce,
},
"text": {
describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
type: "array",
choices: EXTRACT_TEXT_TYPES,
coerce: (array) => {
// backwards compatibility: default --text true / --text -> --text to-pages
if (!array.length || (array.length === 1 && array[0] === "true")) {
return ["to-pages"];
}
return coerce(array);
}
},
"cwd": {
describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
type: "string",
default: process.cwd(),
},
"mobileDevice": {
describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
type: "string",
},
"userAgent": {
describe: "Override user-agent with specified string",
type: "string",
},
"userAgentSuffix": {
describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
type: "string",
},
"useSitemap": {
alias: "sitemap",
describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
},
"sitemapFromDate": {
alias: "sitemapFrom",
describe: "If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
},
"statsFilename": {
describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
},
"behaviors": {
describe: "Which background behaviors to enable on each page",
type: "array",
default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
choices: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
coerce,
},
"behaviorTimeout": {
describe: "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish.",
default: 90,
type: "number",
},
"pageExtraDelay": {
alias: "delay",
describe: "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
default: 0,
type: "number",
},
"dedupPolicy": {
describe: "Deduplication policy",
default: "skip",
type: "string",
choices: ["skip", "revisit", "keep"],
},
"profile": {
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
type: "string",
},
"screenshot": {
describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage",
type: "array",
default: [],
choices: Object.keys(screenshotTypes),
coerce,
},
"screencastPort": {
describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
type: "number",
default: 0
},
"screencastRedis": {
describe: "If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
type: "boolean",
default: false
},
"warcInfo": {
alias: ["warcinfo"],
describe: "Optional fields added to the warcinfo record in combined WARCs",
type: "object"
},
"redisStoreUrl": {
describe: "If set, url for remote redis server to store state. Otherwise, using in-memory store",
type: "string",
default: "redis://localhost:6379/0"
},
"saveState": {
describe: "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
type: "string",
default: "partial",
choices: ["never", "partial", "always"]
},
"saveStateInterval": {
describe: "If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
type: "number",
default: 300,
},
"saveStateHistory": {
describe: "Number of save states to keep during the duration of a crawl",
type: "number",
default: 5,
},
"sizeLimit": {
describe: "If set, save state and exit if size limit exceeds this value",
type: "number",
default: 0,
},
"diskUtilization": {
describe: "If set, save state and exit if disk utilization exceeds this percentage value",
type: "number",
default: 90,
},
"timeLimit": {
describe: "If set, save state and exit after time limit, in seconds",
type: "number",
default: 0,
},
"healthCheckPort": {
describe: "port to run healthcheck on",
type: "number",
default: 0,
},
"overwrite": {
describe: "overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started",
type: "boolean",
default: false
},
"waitOnDone": {
describe: "if set, wait for interrupt signal when finished instead of exiting",
type: "boolean",
default: false
},
"restartsOnError": {
describe: "if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt",
type: "boolean",
default: false
},
"netIdleWait": {
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
type: "number",
default: -1
},
"lang": {
describe: "if set, sets the language used by the browser, should be ISO 639 language[-country] code",
type: "string"
},
"title": {
describe: "If set, write supplied title into WACZ datapackage.json metadata",
type: "string"
},
"description": {
alias: ["desc"],
describe: "If set, write supplied description into WACZ datapackage.json metadata",
type: "string"
},
"originOverride": {
describe: "if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port",
type: "array",
default: [],
},
"logErrorsToRedis": {
describe: "If set, write error messages to redis",
type: "boolean",
default: false,
},
"failOnFailedSeed": {
describe: "If set, crawler will fail with exit code 1 if any seed fails",
type: "boolean",
default: false
},
"failOnFailedLimit": {
describe: "If set, save state and exit if number of failed pages exceeds this value",
type: "number",
default: 0,
},
"customBehaviors": {
describe: "injects a custom behavior file or set of behavior files in a directory",
type: ["string"]
},
"debugAccessRedis": {
describe: "if set, runs internal redis without protected mode to allow external access (for debugging)",
type: "boolean",
}
};
}
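// Parse CLI args (plus any CRAWL_ARGS env var args and an optional YAML config)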
parseArgs(argv) {
argv = argv || process.argv;
if (process.env.CRAWL_ARGS) {
argv = argv.concat(this.splitCrawlArgsQuoteSafe(process.env.CRAWL_ARGS));
}
let origConfig = {};
const parsed = yargs(hideBin(argv))
.usage("crawler [options]")
.option(this.cliOpts)
.config("config", "Path to YAML config file", (configPath) => {
if (configPath === "/crawls/stdin") {
configPath = process.stdin.fd;
}
origConfig = yaml.load(fs.readFileSync(configPath, "utf8"));
return origConfig;
})
.check((argv) => this.validateArgs(argv))
.argv;
return {parsed, origConfig};
}
splitCrawlArgsQuoteSafe(crawlArgs) {
// Split process.env.CRAWL_ARGS on spaces but retaining spaces within double quotes
const regex = /"[^"]+"|[^\s]+/g;
return crawlArgs.match(regex).map(e => e.replace(/"(.+)"/, "$1"));
}
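// Validate parsed args and derive crawl settings (crawl id, behaviors, device emulation, seeds, scoping)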
validateArgs(argv) {
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
// Check that the collection name is valid.
if (argv.collection.search(/^[\w][\w-]*$/) === -1){
logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
}
// background behaviors to apply
const behaviorOpts = {};
argv.behaviors.forEach((x) => behaviorOpts[x] = true);
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
argv.behaviorOpts = JSON.stringify(behaviorOpts);
argv.text = argv.text || [];
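// Resolve device emulation from puppeteer's known devices list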
if (argv.mobileDevice) {
argv.emulateDevice = devices[argv.mobileDevice.replace("-", " ")];
if (!argv.emulateDevice) {
logger.fatal("Unknown device: " + argv.mobileDevice);
}
} else {
argv.emulateDevice = {viewport: null};
}
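// Append seeds read from the seed file, one URL per line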
if (argv.seedFile) {
const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
const urlSeedFileList = urlSeedFile.split("\n");
if (typeof(argv.seeds) === "string") {
argv.seeds = [argv.seeds];
}
for (const seed of urlSeedFileList) {
if (seed) {
argv.seeds.push(seed);
}
}
}
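// If not explicitly set, pick a network-idle wait based on scope type: longer for single-page scopes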
if (argv.netIdleWait === -1) {
if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
argv.netIdleWait = 15;
} else {
argv.netIdleWait = 2;
}
//logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
}
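// Build scoped seeds, applying global scope options with any per-seed overrides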
const scopeOpts = {
scopeType: argv.scopeType,
sitemap: argv.sitemap,
include: argv.include,
exclude: argv.exclude,
depth: argv.depth,
extraHops: argv.extraHops,
};
argv.scopedSeeds = [];
for (let seed of argv.seeds) {
if (typeof(seed) === "string") {
seed = {url: seed};
}
try {
argv.scopedSeeds.push(new ScopedSeed({...scopeOpts, ...seed}));
} catch (e) {
if (argv.failOnFailedSeed) {
logger.fatal(`Invalid Seed "${seed.url}" specified, aborting crawl.`);
}
}
}
if (!argv.scopedSeeds.length) {
logger.fatal("No valid seeds specified, aborting crawl.");
}
// Resolve statsFilename
if (argv.statsFilename) {
argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
}
if ((argv.diskUtilization < 0 || argv.diskUtilization > 99)) {
argv.diskUtilization = 90;
}
return true;
}
}
export function parseArgs(argv) {
return new ArgParser().parseArgs(argv);
}