More flexible multi-value arg parsing + README update for 0.12.0 (#422)

Updated arg parsing to support multi-value arguments specified either as a
single comma-separated string or as multiple strings, using the yargs array
type + a coerce function, following the example in
https://github.com/yargs/yargs/issues/846#issuecomment-517264899

This also allows the `choices` option to be used to validate the values,
when needed.

With this setup, `--text to-pages,to-warc,final-to-warc`, `--text
to-pages,to-warc --text final-to-warc` and `--text to-pages --text
to-warc --text final-to-warc` all result in the same configuration!
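
In short, each multi-value option is declared with yargs' `array` type, a `choices` list, and a shared `coerce` function that splits comma-separated values. A minimal sketch of the pattern (assuming yargs v17; the real option definitions are in the argParser diff below):

```js
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

// split any comma-separated values into a flat array, dropping empty entries
const coerce = (array) => array.flatMap((v) => v.split(",")).filter((x) => !!x);

const argv = yargs(hideBin(process.argv))
  .option("text", {
    describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
    type: "array",
    choices: ["to-pages", "to-warc", "final-to-warc"],
    coerce,
  })
  .parseSync();

// --text to-pages,to-warc,final-to-warc
// --text to-pages,to-warc --text final-to-warc
// --text to-pages --text to-warc --text final-to-warc
// all yield argv.text = [ 'to-pages', 'to-warc', 'final-to-warc' ]
console.log(argv.text);
```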

Updated the other multi-value options (waitUntil, logging, logLevel, context, behaviors, screenshot) to use the same system.

Also updated the README with the new text extraction options and bumped the
version to 0.12.0.

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2023-11-02 11:47:37 -07:00 committed by GitHub
parent 2aeda56d40
commit 15661eb9c8
4 changed files with 84 additions and 92 deletions


@@ -1,6 +1,8 @@
name: Node.js CI
on: [push]
on:
push:
pull_request:
jobs:
lint:


@@ -35,7 +35,7 @@ the following commands. Replace `[URL]` with the web site you'd like to crawl.
Here's how you can use some of the command-line options to configure the crawl:
- To include automated text extraction for full text search, add the `--text` flag.
- To include automated text extraction for full text search to pages.jsonl, add the `--text` flag. To write extracted text to WARCs instead of or in addition to pages.jsonl, see Text Extraction below.
- To limit the crawl to a maximum number of pages, add `--pageLimit P` where P is the number of pages that will be crawled.
@@ -68,15 +68,13 @@ Options:
llel [number] [default: 1]
--crawlId, --id A user provided ID for this crawl or
crawl configuration (can also be se
t via CRAWL_ID env var)
[string] [default: "7760c6c5f6ca"]
--newContext Deprecated as of 0.8.0, any values p
assed will be ignored
[string] [default: null]
t via CRAWL_ID env var, defaults to
hostname) [string]
--waitUntil Puppeteer page.goto() condition to w
ait for before continuing, can be mu
ltiple separated by ','
[default: "load,networkidle2"]
[array] [choices: "load", "domcontentloaded", "networkidle0", "networkidle2"]
[default: ["load","networkidle2"]]
--depth The depth of the crawl for all seeds
[number] [default: -1]
--extraHops Number of extra 'hops' to follow, be
@@ -137,15 +135,16 @@ Options:
--logging Logging options for crawler, can inc
lude: stats (enabled by default), js
errors, pywb, debug
[string] [default: "stats"]
[array] [default: ["stats"]]
--logLevel Comma-separated list of log levels t
o include in logs
[string] [default: ""]
[array] [default: []]
--context Comma-separated list of contexts to
include in logs
[string] [default: ""]
--text If set, extract text to the pages.js
onl file [boolean] [default: false]
include in logs [array] [default: []]
--text Extract initial (default) or final t
ext to pages.jsonl or WARC resource
record(s)
[array] [choices: "to-pages", "to-warc", "final-to-warc"]
--cwd Crawl working directory for captures
(pywb root). If not set, defaults t
o process.cwd()
@@ -171,7 +170,7 @@ Options:
o crawl working directory)
--behaviors Which background behaviors to enable
on each page
[string] [default: "autoplay,autofetch,autoscroll,siteSpecific"]
[array] [default: ["autoplay","autofetch","autoscroll","siteSpecific"]]
--behaviorTimeout If >0, timeout (in seconds) for in-p
age behavior will run on each page.
If 0, a behavior can run until finis
@@ -188,7 +187,7 @@ Options:
--screenshot Screenshot options for crawler, can
include: view, thumbnail, fullPage (
comma-separated list)
[string] [default: ""]
[array] [choices: "view", "thumbnail", "fullPage"] [default: []]
--screencastPort If set to a non-zero value, starts a
n HTTP server with screencast access
ible on this port
@@ -236,6 +235,7 @@ Options:
--restartsOnError if set, assume will be restarted if
interrupted, don't run post-crawl pr
ocesses on interrupt
[boolean] [default: false]
--netIdleWait if set, wait for network idle after
page load and after behaviors are do
ne (in seconds). if -1 (default), de
@@ -259,8 +259,15 @@ Options:
--failOnFailedSeed If set, crawler will fail with exit
code 1 if any seed fails
[boolean] [default: false]
--failOnFailedLimit If set, save state and exit if numbe
r of failed pages exceeds this value
[number] [default: 0]
--customBehaviors injects a custom behavior file or se
t of behavior files in a directory
--debugAccessRedis if set, runs internal redis without
protected mode to allow external acc
ess (for debugging) [boolean]
--config Path to YAML config file
```
</details>
@@ -514,14 +521,28 @@ With version 0.8.0, Browsertrix Crawler includes the ability to take screenshots
Three screenshot options are available:
- `--view`: Takes a png screenshot of the initially visible viewport (1920x1080)
- `--fullPage`: Takes a png screenshot of the full page
- `--thumbnail`: Takes a jpeg thumbnail of the initially visible viewport (1920x1080)
- `--screenshot view`: Takes a png screenshot of the initially visible viewport (1920x1080)
- `--screenshot fullPage`: Takes a png screenshot of the full page
- `--screenshot thumbnail`: Takes a jpeg thumbnail of the initially visible viewport (1920x1080)
These can be combined using a comma-separated list passed to the `--screenshot` option, e.g.: `--screenshot thumbnail,view,fullPage`.
These can be combined using a comma-separated list passed to the `--screenshot` option, e.g. `--screenshot thumbnail,view,fullPage`, or passed in
separately, e.g. `--screenshot thumbnail --screenshot view --screenshot fullPage`.
Screenshots are written into a `screenshots.warc.gz` WARC file in the `archives/` directory. If the `--generateWACZ` command line option is used, the screenshots WARC is written into the `archive` directory of the WACZ file and indexed alongside the other WARCs.
### Text Extraction
Browsertrix Crawler supports text extraction via the `--text` flag.
With version 0.12.0, the `--text` flag accepts one or more of the following extraction options:
- `--text to-pages` - Extract initial text and add it to the text field in pages.jsonl
- `--text to-warc` - Extract initial page text and add it to a `urn:text:<url>` WARC resource record
- `--text final-to-warc` - Extract the final page text after all behaviors have run and add it to a `urn:textFinal:<url>` WARC resource record
The options can be passed separately or combined into a comma-separated list, e.g. `--text to-warc,final-to-warc` and `--text to-warc --text final-to-warc`
are equivalent. For backwards compatibility, `--text` alone is equivalent to `--text to-pages`.
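For example, a crawl that writes both initial and final page text into WARC records might be run as follows (a hypothetical invocation modeled on the Docker commands shown earlier in this README; the URL and collection name are placeholders):
```
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text to-warc,final-to-warc --collection example-text
```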
### Watching the crawl -- Screencasting
With version 0.4.0, Browsertrix Crawler includes an experimental 'screencasting' option, which allows watching the crawl in real-time via screencast (connected via a websocket).


@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.12.0-beta.2",
"version": "0.12.0",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",


@@ -17,6 +17,10 @@ import { logger } from "./logger.js";
// ============================================================================
class ArgParser {
get cliOpts() {
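// split any comma-separated values into a flat array and drop empty entries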
const coerce = array => {
return array.flatMap(v => v.split(",")).filter(x => !!x);
};
return {
"seeds": {
alias: "url",
@@ -40,14 +44,16 @@ class ArgParser {
"crawlId": {
alias: "id",
describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var)",
describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
type: "string",
default: process.env.CRAWL_ID || os.hostname(),
},
"waitUntil": {
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
default: "load,networkidle2",
type: "array",
default: ["load", "networkidle2"],
choices: WAIT_UNTIL_OPTS,
coerce,
},
"depth": {
@@ -173,25 +179,36 @@ class ArgParser {
"logging": {
describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug",
type: "string",
default: "stats",
type: "array",
default: ["stats"],
coerce,
},
"logLevel": {
describe: "Comma-separated list of log levels to include in logs",
type: "string",
default: "",
type: "array",
default: [],
coerce,
},
"context": {
describe: "Comma-separated list of contexts to include in logs",
type: "string",
default: "",
type: "array",
default: [],
coerce,
},
"text": {
describe: "If set, extract text to the pages.jsonl file",
type: "string",
describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
type: "array",
choices: EXTRACT_TEXT_TYPES,
coerce: (array) => {
// backwards compatibility: default --text true / --text -> --text to-pages
if (!array.length || (array.length === 1 && array[0] === "true")) {
return ["to-pages"];
}
return coerce(array);
}
},
"cwd": {
@@ -231,8 +248,10 @@ class ArgParser {
"behaviors": {
describe: "Which background behaviors to enable on each page",
default: "autoplay,autofetch,autoscroll,siteSpecific",
type: "string",
type: "array",
default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
choices: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
coerce,
},
"behaviorTimeout": {
@@ -261,9 +280,11 @@ class ArgParser {
},
"screenshot": {
describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage (comma-separated list)",
type: "string",
default: "",
describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage",
type: "array",
default: [],
choices: Array.from(Object.keys(screenshotTypes)),
coerce,
},
"screencastPort": {
@@ -441,6 +462,7 @@ class ArgParser {
}
validateArgs(argv) {
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
// Check that the collection name is valid.
@@ -448,42 +470,13 @@ class ArgParser {
logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
}
// waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
// can be multiple separate by comma
// (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
if (typeof argv.waitUntil != "object") {
argv.waitUntil = argv.waitUntil.split(",");
}
// split text options
if (argv.text === "" || argv.text === "true") {
argv.text = "to-pages";
}
argv.waitUntil = validateArrayOpts(argv.waitUntil, "waitUntil", WAIT_UNTIL_OPTS);
argv.screenshot = validateArrayOpts(argv.screenshot, "screenshot", Array.from(Object.keys(screenshotTypes)));
argv.text = validateArrayOpts(argv.text, "text", EXTRACT_TEXT_TYPES);
// log options
argv.logging = argv.logging.split(",");
argv.logLevel = argv.logLevel ? argv.logLevel.split(",") : [];
argv.context = argv.context ? argv.context.split(",") : [];
// background behaviors to apply
const behaviorOpts = {};
if (typeof argv.behaviors != "object"){
argv.behaviors = argv.behaviors.split(",");
}
argv.behaviors.forEach((x) => behaviorOpts[x] = true);
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
argv.behaviorOpts = JSON.stringify(behaviorOpts);
if (argv.newContext) {
logger.info("Note: The newContext argument is deprecated in 0.8.0. Values passed to this option will be ignored");
}
argv.text = argv.text || [];
if (argv.mobileDevice) {
argv.emulateDevice = devices[argv.mobileDevice.replace("-", " ")];
@@ -560,30 +553,6 @@ class ArgParser {
}
}
function validateArrayOpts(value, name, allowedValues) {
if (!value) {
return [];
}
if (value instanceof Array) {
return value;
}
if (typeof(value) !== "string") {
return [];
}
const arrayValue = value.split(",");
for (value of arrayValue) {
if (!allowedValues.includes(value)) {
logger.fatal(`Invalid value "${value}" for field "${name}": allowed values are: ${allowedValues.join(",")}`);
}
}
return arrayValue;
}
export function parseArgs(argv) {
return new ArgParser().parseArgs(argv);
}