import path from "path"; import fs from "fs"; import os from "os"; import yaml from "js-yaml"; import { KnownDevices as devices } from "puppeteer-core"; import yargs from "yargs"; import { hideBin } from "yargs/helpers"; import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js"; import { ScopedSeed } from "./seeds.js"; import { interpolateFilename } from "./storage.js"; import { screenshotTypes } from "./screenshots.js"; import { logger } from "./logger.js"; // ============================================================================ class ArgParser { get cliOpts() { const coerce = array => { return array.flatMap(v => v.split(",")).filter(x => !!x); }; return { "seeds": { alias: "url", describe: "The URL to start crawling from", type: "array", default: [], }, "seedFile": { alias: ["urlFile"], describe: "If set, read a list of seed urls, one per line, from the specified", type: "string", }, "workers": { alias: "w", describe: "The number of workers to run in parallel", default: 1, type: "number", }, "crawlId": { alias: "id", describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)", type: "string", }, "waitUntil": { describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','", type: "array", default: ["load", "networkidle2"], choices: WAIT_UNTIL_OPTS, coerce, }, "depth": { describe: "The depth of the crawl for all seeds", default: -1, type: "number", }, "extraHops": { describe: "Number of extra 'hops' to follow, beyond the current scope", default: 0, type: "number" }, "pageLimit": { alias: "limit", describe: "Limit crawl to this number of pages", default: 0, type: "number", }, "maxPageLimit": { describe: "Maximum pages to crawl, overriding pageLimit if both are set", default: 0, type: "number", }, "pageLoadTimeout": { alias: "timeout", describe: "Timeout for each page to load (in seconds)", default: 90, type: "number", }, "scopeType": { describe: "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes", type: "string", choices: ["page", "page-spa", "prefix", "host", "domain", "any", "custom"] }, "scopeIncludeRx": { alias: "include", describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)", }, "scopeExcludeRx": { alias: "exclude", describe: "Regex of page URLs that should be excluded from the crawl." 
}, "allowHashUrls": { describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content", }, "blockRules": { describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe", type: "array", default: [], }, "blockMessage": { describe: "If specified, when a URL is blocked, a record with this error message is added instead", type: "string", }, "blockAds": { alias: "blockads", describe: "If set, block advertisements from being loaded (based on Stephen Black's blocklist)", type: "boolean", default: false, }, "adBlockMessage": { describe: "If specified, when an ad is blocked, a record with this error message is added instead", type: "string", }, "collection": { alias: "c", describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)", type: "string", default: "crawl-@ts" }, "headless": { describe: "Run in headless mode, otherwise start xvfb", type: "boolean", default: false, }, "driver": { describe: "JS driver for the crawler", type: "string", default: "./defaultDriver.js", }, "generateCDX": { alias: ["generatecdx", "generateCdx"], describe: "If set, generate index (CDXJ) for use with pywb after crawl is done", type: "boolean", default: false, }, "combineWARC": { alias: ["combinewarc", "combineWarc"], describe: "If set, combine the warcs", type: "boolean", default: false, }, "rolloverSize": { describe: "If set, declare the rollover size", default: 1000000000, type: "number", }, "generateWACZ": { alias: ["generatewacz", "generateWacz"], describe: "If set, generate wacz", type: "boolean", default: false, }, "logging": { describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug", type: "array", default: ["stats"], coerce, }, "logLevel": { describe: "Comma-separated list of log levels to include in logs", type: "array", default: [], coerce, }, "context": { describe: "Comma-separated list of contexts to include in logs", type: "array", default: [], coerce, }, "text": { describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)", type: "array", choices: EXTRACT_TEXT_TYPES, coerce: (array) => { // backwards compatibility: default --text true / --text -> --text to-pages if (!array.length || (array.length === 1 && array[0] === "true")) { return ["to-pages"]; } return coerce(array); } }, "cwd": { describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()", type: "string", default: process.cwd(), }, "mobileDevice": { describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts", type: "string", }, "userAgent": { describe: "Override user-agent with specified string", type: "string", }, "userAgentSuffix": { describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)", type: "string", }, "useSitemap": { alias: "sitemap", describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified", }, "sitemapFromDate": { alias: "sitemapFrom", describe: "If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)", }, "statsFilename": { describe: "If set, output stats as JSON to this file. 
      "behaviors": {
        describe: "Which background behaviors to enable on each page",
        type: "array",
        default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
        choices: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
        coerce,
      },
      "behaviorTimeout": {
        describe: "If >0, timeout (in seconds) for in-page behaviors on each page. If 0, a behavior can run until it finishes.",
        default: 90,
        type: "number",
      },
      "pageExtraDelay": {
        alias: "delay",
        describe: "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
        default: 0,
        type: "number",
      },
      "dedupPolicy": {
        describe: "Deduplication policy",
        default: "skip",
        type: "string",
        choices: ["skip", "revisit", "keep"],
      },
      "profile": {
        describe: "Path to tar.gz file which will be extracted and used as the browser profile",
        type: "string",
      },
      "screenshot": {
        describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage",
        type: "array",
        default: [],
        choices: Array.from(Object.keys(screenshotTypes)),
        coerce,
      },
      "screencastPort": {
        describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
        type: "number",
        default: 0,
      },
      "screencastRedis": {
        describe: "If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
        type: "boolean",
        default: false,
      },
      "warcInfo": {
        alias: ["warcinfo"],
        describe: "Optional fields added to the warcinfo record in combined WARCs",
        type: "object",
      },
      "redisStoreUrl": {
        describe: "If set, url for remote redis server to store state. Otherwise, an in-memory store is used",
        type: "string",
        default: "redis://localhost:6379/0",
      },
      "saveState": {
        describe: "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
        type: "string",
        default: "partial",
        choices: ["never", "partial", "always"],
      },
      "saveStateInterval": {
        describe: "If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
        type: "number",
        default: 300,
      },
      "saveStateHistory": {
        describe: "Number of save states to keep during the duration of a crawl",
        type: "number",
        default: 5,
      },
      "sizeLimit": {
        describe: "If set, save state and exit if crawl size exceeds this value",
        type: "number",
        default: 0,
      },
      "diskUtilization": {
        describe: "If set, save state and exit if disk utilization exceeds this percentage value",
        type: "number",
        default: 90,
      },
      "timeLimit": {
        describe: "If set, save state and exit after time limit, in seconds",
        type: "number",
        default: 0,
      },
      "healthCheckPort": {
        describe: "port to run healthcheck on",
        type: "number",
        default: 0,
      },
      "overwrite": {
        describe: "overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started",
        type: "boolean",
        default: false,
      },
      "waitOnDone": {
        describe: "if set, wait for interrupt signal when finished instead of exiting",
        type: "boolean",
        default: false,
      },
      "restartsOnError": {
        describe: "if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt",
        type: "boolean",
        default: false,
      },
      "netIdleWait": {
        describe: "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
        type: "number",
        default: -1,
      },
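      // Example combining the save-state options above (flag values are hypothetical):
      //   --saveState always --saveStateInterval 600 --saveStateHistory 3
      // would snapshot crawl state every 10 minutes and keep only the 3 most recent
      // snapshots for the duration of the crawl.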
      "lang": {
        describe: "if set, sets the language used by the browser, should be ISO 639 language[-country] code",
        type: "string",
      },
      "title": {
        describe: "If set, write supplied title into WACZ datapackage.json metadata",
        type: "string",
      },
      "description": {
        alias: ["desc"],
        describe: "If set, write supplied description into WACZ datapackage.json metadata",
        type: "string",
      },
      "originOverride": {
        describe: "if set, will redirect requests from each origin in key to origin in the value, e.g. --originOverride https://host:port=http://alt-host:alt-port",
        type: "array",
        default: [],
      },
      "logErrorsToRedis": {
        describe: "If set, write error messages to redis",
        type: "boolean",
        default: false,
      },
      "failOnFailedSeed": {
        describe: "If set, crawler will fail with exit code 1 if any seed fails",
        type: "boolean",
        default: false,
      },
      "failOnFailedLimit": {
        describe: "If set, save state and exit if number of failed pages exceeds this value",
        type: "number",
        default: 0,
      },
      "customBehaviors": {
        describe: "Injects a custom behavior file or set of behavior files in a directory",
        type: ["string"],
      },
      "debugAccessRedis": {
        describe: "if set, runs internal redis without protected mode to allow external access (for debugging)",
        type: "boolean",
      },
    };
  }

  parseArgs(argv) {
    argv = argv || process.argv;

    if (process.env.CRAWL_ARGS) {
      argv = argv.concat(this.splitCrawlArgsQuoteSafe(process.env.CRAWL_ARGS));
    }

    let origConfig = {};

    const parsed = yargs(hideBin(argv))
      .usage("crawler [options]")
      .option(this.cliOpts)
      .config("config", "Path to YAML config file", (configPath) => {
        if (configPath === "/crawls/stdin") {
          configPath = process.stdin.fd;
        }
        origConfig = yaml.load(fs.readFileSync(configPath, "utf8"));
        return origConfig;
      })
      .check((argv) => this.validateArgs(argv))
      .argv;

    return { parsed, origConfig };
  }

  splitCrawlArgsQuoteSafe(crawlArgs) {
    // Split process.env.CRAWL_ARGS on spaces, but retain spaces within double quotes
    const regex = /"[^"]+"|[^\s]+/g;
    return crawlArgs.match(regex).map((e) => e.replace(/"(.+)"/, "$1"));
  }
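  // Sketch of how splitCrawlArgsQuoteSafe behaves (the env value is hypothetical):
  //   CRAWL_ARGS='--title "My Crawl" --workers 4'
  // first matches ["--title", "\"My Crawl\"", "--workers", "4"], then the surrounding
  // double quotes are stripped, so quoted values may safely contain spaces.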
  validateArgs(argv) {
    argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
    argv.collection = interpolateFilename(argv.collection, argv.crawlId);

    // Check that the collection name is valid.
    if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
      logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
    }

    // background behaviors to apply
    const behaviorOpts = {};
    argv.behaviors.forEach((x) => behaviorOpts[x] = true);
    behaviorOpts.log = BEHAVIOR_LOG_FUNC;
    argv.behaviorOpts = JSON.stringify(behaviorOpts);

    argv.text = argv.text || [];

    if (argv.mobileDevice) {
      argv.emulateDevice = devices[argv.mobileDevice.replace("-", " ")];
      if (!argv.emulateDevice) {
        logger.fatal("Unknown device: " + argv.mobileDevice);
      }
    } else {
      argv.emulateDevice = { viewport: null };
    }

    if (argv.seedFile) {
      const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
      const urlSeedFileList = urlSeedFile.split("\n");

      if (typeof(argv.seeds) === "string") {
        argv.seeds = [argv.seeds];
      }

      for (const seed of urlSeedFileList) {
        if (seed) {
          argv.seeds.push(seed);
        }
      }
    }

    if (argv.netIdleWait === -1) {
      if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
        argv.netIdleWait = 15;
      } else {
        argv.netIdleWait = 2;
      }
      //logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
    }

    const scopeOpts = {
      scopeType: argv.scopeType,
      sitemap: argv.sitemap,
      include: argv.include,
      exclude: argv.exclude,
      depth: argv.depth,
      extraHops: argv.extraHops,
    };

    argv.scopedSeeds = [];

    for (let seed of argv.seeds) {
      if (typeof(seed) === "string") {
        seed = { url: seed };
      }

      try {
        argv.scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...seed }));
      } catch (e) {
        // An invalid seed is silently skipped unless failOnFailedSeed is set
        if (argv.failOnFailedSeed) {
          logger.fatal(`Invalid Seed "${seed.url}" specified, aborting crawl.`);
        }
      }
    }

    if (!argv.scopedSeeds.length) {
      logger.fatal("No valid seeds specified, aborting crawl.");
    }

    // Resolve statsFilename
    if (argv.statsFilename) {
      argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
    }

    if (argv.diskUtilization < 0 || argv.diskUtilization > 99) {
      argv.diskUtilization = 90;
    }

    return true;
  }
}

export function parseArgs(argv) {
  return new ArgParser().parseArgs(argv);
}
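
// Minimal usage sketch (the entrypoint and module path shown are assumptions,
// not part of this file; it only illustrates the {parsed, origConfig} return shape):
//
//   import { parseArgs } from "./argParser.js";
//
//   const { parsed, origConfig } = parseArgs(process.argv);
//   console.log(`crawl id: ${parsed.crawlId}, seeds in scope: ${parsed.scopedSeeds.length}`);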