browsertrix-crawler/util/argParser.js
Ilya Kreymer 473de8c49f
Scope Handling Improvements + Tests (#66)
* scope fixes:
- remove default prefix scopeType, ensure scope include and exclude take precedence
- add new 'custom' scopeType, when include or exclude are used
- use --scopeIncludeRx and --scopeExcludeRx for better consistency for scope include and exclude (also allow --include/--exclude)
- ensure per-seed scope include/exclude used when present, and scopeType set to 'custom'
- ensure default scope is set to 'prefix' if no scopeType and no include/exclude regexes specified
- rename --type to --scopeType in seed to maintain consistency
- add sitemap param as alias for useSitemap

tests: 
- add seed scope resolution tests for argParse, testing per-scope seed resolution, inheritance and overrides
- fix screencaster to use relative paths to work with tests
- ci: use yarn instead of npm

* update README with new flags

* bump version to 0.4.0-beta.3
2021-07-06 20:22:27 -07:00

345 lines
9.9 KiB
JavaScript

const path = require("path");
const fs = require("fs");
const child_process = require("child_process");
const yaml = require("js-yaml");
const puppeteer = require("puppeteer-core");
const { Cluster } = require("puppeteer-cluster");
const yargs = require("yargs/yargs");
const { hideBin } = require("yargs/helpers");
const { NewWindowPage} = require("./screencaster");
const { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } = require("./constants");
const { ScopedSeed } = require("./seeds");
// ============================================================================
/**
 * Declares the crawler's command-line options and turns raw CLI/YAML input
 * into a validated, normalized argv object.
 */
class ArgParser {
  /**
   * @param {string} profileDir - directory into which a `--profile` tarball
   *   is extracted during validation.
   */
  constructor(profileDir) {
    this.profileDir = profileDir;
  }

  /**
   * Normalizes an option that may be given either as a comma-separated
   * string (command line) or as an already-split list (e.g. YAML config).
   *
   * @param {string|string[]} value
   * @returns {string[]}
   */
  static splitCommaList(value) {
    return Array.isArray(value) ? value : value.split(",");
  }

  /**
   * Option definitions in the shape accepted by yargs' `.option()`.
   * A fresh object is built on every access.
   */
  get cliOpts() {
    return {
      "seeds": {
        alias: "url",
        describe: "The URL to start crawling from",
        type: "array",
        default: [],
      },

      "seedFile": {
        alias: ["urlFile"],
        describe: "If set, read a list of seed urls, one per line, from the specified file",
        type: "string",
      },

      "workers": {
        alias: "w",
        describe: "The number of workers to run in parallel",
        default: 1,
        type: "number",
      },

      "newContext": {
        describe: "The context for each new capture, can be a new: page, window, session or browser.",
        default: "page",
        type: "string"
      },

      "waitUntil": {
        describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','",
        default: "load,networkidle0",
      },

      "depth": {
        describe: "The depth of the crawl for all seeds",
        default: -1,
        type: "number",
      },

      "limit": {
        describe: "Limit crawl to this number of pages",
        default: 0,
        type: "number",
      },

      "timeout": {
        describe: "Timeout for each page to load (in seconds)",
        default: 90,
        type: "number",
      },

      "scopeType": {
        describe: "Predefined for which URLs to crawl, can be: prefix, page, host, any, or custom, to use the scopeIncludeRx/scopeExcludeRx",
        type: "string",
      },

      "scopeIncludeRx": {
        alias: "include",
        describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
      },

      "scopeExcludeRx": {
        alias: "exclude",
        describe: "Regex of page URLs that should be excluded from the crawl."
      },

      "allowHashUrls": {
        describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
      },

      "collection": {
        alias: "c",
        describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
        type: "string",
        default: `capture-${new Date().toISOString().slice(0,19)}`.replace(/:/g, "-")
      },

      "headless": {
        describe: "Run in headless mode, otherwise start xvfb",
        type: "boolean",
        default: false,
      },

      "driver": {
        describe: "JS driver for the crawler",
        type: "string",
        default: path.join(__dirname, "..", "defaultDriver.js"),
      },

      "generateCDX": {
        alias: ["generatecdx", "generateCdx"],
        describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
        type: "boolean",
        default: false,
      },

      "combineWARC": {
        alias: ["combinewarc", "combineWarc"],
        describe: "If set, combine the warcs",
        type: "boolean",
        default: false,
      },

      "rolloverSize": {
        describe: "If set, declare the rollover size",
        default: 1000000000,
        type: "number",
      },

      "generateWACZ": {
        alias: ["generatewacz", "generateWacz"],
        describe: "If set, generate wacz",
        type: "boolean",
        default: false,
      },

      "logging": {
        describe: "Logging options for crawler, can include: stats, pywb, behaviors, behaviors-debug",
        type: "string",
        default: "stats",
      },

      "text": {
        describe: "If set, extract text to the pages.jsonl file",
        type: "boolean",
        default: false,
      },

      "cwd": {
        describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
        type: "string",
        default: process.cwd(),
      },

      "mobileDevice": {
        describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
        type: "string",
      },

      "userAgent": {
        describe: "Override user-agent with specified string",
        type: "string",
      },

      "userAgentSuffix": {
        describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
        type: "string",
      },

      "useSitemap": {
        alias: "sitemap",
        describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
      },

      "statsFilename": {
        describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
      },

      "behaviors": {
        describe: "Which background behaviors to enable on each page",
        default: "autoplay,autofetch,siteSpecific",
        type: "string",
      },

      "profile": {
        describe: "Path to tar.gz file which will be extracted and used as the browser profile",
        type: "string",
      },

      "screencastPort": {
        describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
        type: "number",
        default: 0
      },
    };
  }

  /**
   * Parses and validates command-line arguments.
   *
   * @param {string[]} [argv] - full argument vector; falls back to
   *   `process.argv` when falsy. Passed through `hideBin()` before parsing.
   * @returns {object} the yargs result, after `validateArgs` has run on it.
   */
  parseArgs(argv) {
    const rawArgs = argv || process.argv;

    return yargs(hideBin(rawArgs))
      .usage("crawler [options]")
      .option(this.cliOpts)
      .config("config", "Path to YAML config file", (configPath) => {
        // The special path "/crawls/stdin" means: read the config from stdin.
        const source = (configPath === "/crawls/stdin") ? process.stdin.fd : configPath;
        // NOTE(review): yaml.load() on an external file - whether this is
        // restricted to a safe schema depends on the installed js-yaml
        // version; confirm against package.json.
        return yaml.load(fs.readFileSync(source, "utf8"));
      })
      .check((parsed) => this.validateArgs(parsed))
      .argv;
  }

  /**
   * Validates `argv` and normalizes it in place:
   *  - `timeout` is converted from seconds to milliseconds
   *  - `waitUntil`, `logging` and `behaviors` become arrays
   *  - `behaviorOpts` (JSON string) and optionally `behaviorsLogDebug` are added
   *  - `newContext` is replaced by the matching concurrency implementation
   *  - `emulateDevice` is resolved from `mobileDevice`
   *  - seeds from `seedFile` are appended to `seeds`
   *  - `scopedSeeds` is built with one ScopedSeed per seed
   *  - `statsFilename` is resolved against `cwd`
   *  - a `profile` tarball is extracted into `this.profileDir`
   *
   * @param {object} argv - parsed options (mutated).
   * @returns {boolean} always `true` when validation passes.
   * @throws {Error} on an invalid collection name, waitUntil value,
   *   newContext value or unknown mobile device.
   */
  validateArgs(argv) {
    // Check that the collection name is valid.
    if (argv.collection.search(/^[\w][\w-]*$/) === -1){
      throw new Error(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
    }

    // seconds -> milliseconds
    argv.timeout *= 1000;

    // waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
    // can be multiple separate by comma
    // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
    argv.waitUntil = ArgParser.splitCommaList(argv.waitUntil);

    for (const opt of argv.waitUntil) {
      if (!WAIT_UNTIL_OPTS.includes(opt)) {
        throw new Error("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
      }
    }

    // log options: accept a pre-split list as well as a comma-separated
    // string, the same way waitUntil and behaviors do
    argv.logging = ArgParser.splitCommaList(argv.logging);

    // background behaviors to apply
    const behaviorOpts = {};
    argv.behaviors = ArgParser.splitCommaList(argv.behaviors);
    for (const name of argv.behaviors) {
      behaviorOpts[name] = true;
    }

    if (argv.logging.includes("behaviors")) {
      behaviorOpts.log = BEHAVIOR_LOG_FUNC;
    } else if (argv.logging.includes("behaviors-debug")) {
      behaviorOpts.log = BEHAVIOR_LOG_FUNC;
      argv.behaviorsLogDebug = true;
    }

    argv.behaviorOpts = JSON.stringify(behaviorOpts);

    if (!argv.newContext) {
      argv.newContext = "page";
    }

    switch (argv.newContext) {
    case "page":
      argv.newContext = Cluster.CONCURRENCY_PAGE;
      if (argv.screencastPort && argv.workers > 1) {
        console.warn("Note: Screencast with >1 workers and default page context may only show one page at a time. To fix, add '--newContext window' to open each page in a new window");
      }
      break;

    case "session":
      argv.newContext = Cluster.CONCURRENCY_CONTEXT;
      break;

    case "browser":
      argv.newContext = Cluster.CONCURRENCY_BROWSER;
      break;

    case "window":
      argv.newContext = NewWindowPage;
      break;

    default:
      // "window" is a supported value and must be listed here too
      throw new Error("Invalid newContext, must be one of: page, session, browser, window");
    }

    if (argv.mobileDevice) {
      argv.emulateDevice = puppeteer.devices[argv.mobileDevice];
      if (!argv.emulateDevice) {
        throw new Error("Unknown device: " + argv.mobileDevice);
      }
    }

    if (argv.seedFile) {
      const seedFileText = fs.readFileSync(argv.seedFile, "utf8");

      if (typeof(argv.seeds) === "string") {
        argv.seeds = [argv.seeds];
      }

      // Accept both LF and CRLF line endings; ignore blank and
      // whitespace-only lines so they do not turn into bogus seeds.
      for (const line of seedFileText.split(/\r?\n/)) {
        const seedUrl = line.trim();
        if (seedUrl) {
          argv.seeds.push(seedUrl);
        }
      }
    }

    if (argv.include || argv.exclude) {
      if (argv.scopeType && argv.scopeType !== "custom") {
        console.warn("You've specified a --scopeType and a --scopeIncludeRx or --scopeExcludeRx regex. The custom scope regex will take precedence, overriding the scopeType");
        argv.scopeType = "custom";
      }
    }

    // Crawl-wide scope settings; any key present on an individual seed
    // object overrides the corresponding value here.
    const scopeOpts = {
      scopeType: argv.scopeType,
      sitemap: argv.sitemap,
      include: argv.include,
      exclude: argv.exclude,
      depth: argv.depth,
    };

    argv.scopedSeeds = [];

    for (let seed of argv.seeds) {
      if (typeof(seed) === "string") {
        seed = {url: seed};
      }
      argv.scopedSeeds.push(new ScopedSeed({...scopeOpts, ...seed}));
    }

    // Resolve statsFilename
    if (argv.statsFilename) {
      argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
    }

    if (argv.profile) {
      // Pass the archive path as a separate argument instead of building a
      // shell command string: paths containing spaces or shell
      // metacharacters are then handled literally.
      child_process.execFileSync("tar", ["xvfz", argv.profile], {cwd: this.profileDir});
    }

    return true;
  }
}
/**
 * Module entry point: builds an ArgParser bound to `profileDir` and returns
 * the result of parsing `argv` with it.
 *
 * @param {string} profileDir - handed to the ArgParser constructor.
 * @param {string[]} [argv] - argument vector forwarded to ArgParser#parseArgs.
 * @returns {object} whatever ArgParser#parseArgs returns.
 */
module.exports.parseArgs = function(profileDir, argv) {
  const parser = new ArgParser(profileDir);
  return parser.parseArgs(argv);
};