Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Synced 2025-10-19 06:23:16 +00:00
More flexible multi value arg parsing + README update for 0.12.0 (#422)
Updated arg parsing, following the example in https://github.com/yargs/yargs/issues/846#issuecomment-517264899, to support multi-value arguments specified either as a single comma-separated string or as multiple strings, using the array type plus a coerce function. This also allows the `choices` option to validate the values where needed. With this setup, `--text to-pages,to-warc,final-to-warc`, `--text to-pages,to-warc --text final-to-warc`, and `--text to-pages --text to-warc --text final-to-warc` all result in the same configuration! Updated the other multiple-choice args (waitUntil, logging, logLevel, context, behaviors, screenshot) to use the same system. Also updated the README with the new text extraction options and bumped the version to 0.12.0.

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
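For illustration, a minimal standalone sketch (not the crawler's actual CLI setup) of the yargs pattern this commit adopts, shown with a hypothetical `--text` option: the `array` type plus a `coerce` function that splits comma-separated values, with `choices` left in place for validation as the message above describes.

```js
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

// Split any comma-separated entries and drop empty strings,
// so one flag or repeated flags normalize to the same array.
const coerce = (array) => array.flatMap((v) => v.split(",")).filter((x) => !!x);

const argv = yargs(hideBin(process.argv))
  .option("text", {
    type: "array",
    choices: ["to-pages", "to-warc", "final-to-warc"],
    coerce,
  })
  .parseSync();

// --text to-pages,to-warc,final-to-warc
// --text to-pages,to-warc --text final-to-warc
// --text to-pages --text to-warc --text final-to-warc
// all yield: argv.text === ["to-pages", "to-warc", "final-to-warc"]
console.log(argv.text);
```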
Parent: 2aeda56d40
Commit: 15661eb9c8
4 changed files with 84 additions and 92 deletions
.github/workflows/ci.yaml (vendored): 4 changes

@@ -1,6 +1,8 @@
 name: Node.js CI

-on: [push]
+on:
+  push:
+  pull_request:

 jobs:
   lint:
README.md: 61 changes
@@ -35,7 +35,7 @@ the following commands. Replace `[URL]` with the web site you'd like to crawl.

 Here's how you can use some of the command-line options to configure the crawl:

-- To include automated text extraction for full text search, add the `--text` flag.
+- To include automated text extraction for full text search to pages.jsonl, add the `--text` flag. To write extracted text to WARCs instead of or in addition to pages.jsonl, see Text Extraction below.

 - To limit the crawl to a maximum number of pages, add `--pageLimit P` where P is the number of pages that will be crawled.
@@ -68,15 +68,13 @@ Options:
                        llel [number] [default: 1]
   --crawlId, --id      A user provided ID for this crawl or
                        crawl configuration (can also be se
-                       t via CRAWL_ID env var)
-                       [string] [default: "7760c6c5f6ca"]
-  --newContext         Deprecated as of 0.8.0, any values p
-                       assed will be ignored
-                       [string] [default: null]
+                       t via CRAWL_ID env var, defaults to
+                       hostname) [string]
   --waitUntil          Puppeteer page.goto() condition to w
                        ait for before continuing, can be mu
                        ltiple separated by ','
-                       [default: "load,networkidle2"]
+  [array] [choices: "load", "domcontentloaded", "networkidle0", "networkidle2"]
+                       [default: ["load","networkidle2"]]
   --depth              The depth of the crawl for all seeds
                        [number] [default: -1]
   --extraHops          Number of extra 'hops' to follow, be
@@ -137,15 +135,16 @@ Options:
   --logging            Logging options for crawler, can inc
                        lude: stats (enabled by default), js
                        errors, pywb, debug
-                       [string] [default: "stats"]
+                       [array] [default: ["stats"]]
   --logLevel           Comma-separated list of log levels t
                        o include in logs
-                       [string] [default: ""]
+                       [array] [default: []]
   --context            Comma-separated list of contexts to
-                       include in logs
-                       [string] [default: ""]
-  --text               If set, extract text to the pages.js
-                       onl file [boolean] [default: false]
+                       include in logs [array] [default: []]
+  --text               Extract initial (default) or final t
+                       ext to pages.jsonl or WARC resource
+                       record(s)
+  [array] [choices: "to-pages", "to-warc", "final-to-warc"]
   --cwd                Crawl working directory for captures
                        (pywb root). If not set, defaults t
                        o process.cwd()
@@ -171,7 +170,7 @@ Options:
                        o crawl working directory)
   --behaviors          Which background behaviors to enable
                        on each page
-  [string] [default: "autoplay,autofetch,autoscroll,siteSpecific"]
+  [array] [default: ["autoplay","autofetch","autoscroll","siteSpecific"]]
   --behaviorTimeout    If >0, timeout (in seconds) for in-p
                        age behavior will run on each page.
                        If 0, a behavior can run until finis
@@ -188,7 +187,7 @@ Options:
   --screenshot         Screenshot options for crawler, can
                        include: view, thumbnail, fullPage (
                        comma-separated list)
-                       [string] [default: ""]
+  [array] [choices: "view", "thumbnail", "fullPage"] [default: []]
   --screencastPort     If set to a non-zero value, starts a
                        n HTTP server with screencast access
                        ible on this port
@@ -236,6 +235,7 @@ Options:
   --restartsOnError    if set, assume will be restarted if
                        interrupted, don't run post-crawl pr
                        ocesses on interrupt
                        [boolean] [default: false]
   --netIdleWait        if set, wait for network idle after
                        page load and after behaviors are do
                        ne (in seconds). if -1 (default), de
@@ -259,8 +259,15 @@ Options:
   --failOnFailedSeed   If set, crawler will fail with exit
                        code 1 if any seed fails
                        [boolean] [default: false]
+  --failOnFailedLimit  If set, save state and exit if numbe
+                       r of failed pages exceeds this value
+                       [number] [default: 0]
+  --customBehaviors    injects a custom behavior file or se
+                       t of behavior files in a directory
   --debugAccessRedis   if set, runs internal redis without
                        protected mode to allow external acc
                        ess (for debugging) [boolean]
   --config             Path to YAML config file

 ```
 </details>
@@ -514,14 +521,28 @@ With version 0.8.0, Browsertrix Crawler includes the ability to take screenshots

 Three screenshot options are available:

-- `--view`: Takes a png screenshot of the initially visible viewport (1920x1080)
-- `--fullPage`: Takes a png screenshot of the full page
-- `--thumbnail`: Takes a jpeg thumbnail of the initially visible viewport (1920x1080)
+- `--screenshot view`: Takes a png screenshot of the initially visible viewport (1920x1080)
+- `--screenshot fullPage`: Takes a png screenshot of the full page
+- `--screenshot thumbnail`: Takes a jpeg thumbnail of the initially visible viewport (1920x1080)

-These can be combined using a comma-separated list passed to the `--screenshot` option, e.g.: `--screenshot thumbnail,view,fullPage`.
+These can be combined using a comma-separated list passed to the `--screenshot` option, e.g.: `--screenshot thumbnail,view,fullPage` or passed in
+separately `--screenshot thumbnail --screenshot view --screenshot fullPage`.

 Screenshots are written into a `screenshots.warc.gz` WARC file in the `archives/` directory. If the `--generateWACZ` command line option is used, the screenshots WARC is written into the `archive` directory of the WACZ file and indexed alongside the other WARCs.

+### Text Extraction
+
+Browsertrix Crawler supports text extraction via the `--text` flag.
+
+With version 0.12.0, the `--text` flag accepts one or more of the following extraction options:
+
+- `--text to-pages` - Extract initial text and add it to the text field in pages.jsonl
+- `--text to-warc` - Extract initial page text and add it to a `urn:text:<url>` WARC resource record
+- `--text final-to-warc` - Extract the final page text after all behaviors have run and add it to a `urn:textFinal:<url>` WARC resource record
+
+The options can be separate or combined into a comma separate list, eg. `--text to-warc,final-to-warc` or `--text to-warc --text final-to-warc`
+are equivalent. For backwards compatibility, `--text` alone is equivalent to `--text to-pages`.
+
 ### Watching the crawl -- Screencasting

 With version 0.4.0, Browsertrix Crawler includes an experimental 'screencasting' option, which allows watching the crawl in real-time via screencast (connected via a websocket).
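Putting the two new multi-value flags from this hunk together, a hypothetical invocation (the docker command shape follows the README's earlier examples; the URL and exact flag combination are illustrative only):

```
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ --generateWACZ \
  --text to-pages,final-to-warc \
  --screenshot thumbnail --screenshot view
```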
package.json

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.12.0-beta.2",
+  "version": "0.12.0",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
util/argParser.js

@@ -17,6 +17,10 @@ import { logger } from "./logger.js";
 // ============================================================================
 class ArgParser {
   get cliOpts() {
+    const coerce = array => {
+      return array.flatMap(v => v.split(",")).filter(x => !!x);
+    };
+
     return {
       "seeds": {
         alias: "url",
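For illustration, the new coerce helper's behavior on a few hypothetical inputs (values not taken from the crawler):

```js
const coerce = array => array.flatMap(v => v.split(",")).filter(x => !!x);

coerce(["to-pages,to-warc", "final-to-warc"]); // ["to-pages", "to-warc", "final-to-warc"]
coerce(["load", "networkidle2"]);              // already split: ["load", "networkidle2"]
coerce([""]);                                  // empty strings are dropped: []
```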
@@ -40,14 +44,16 @@

       "crawlId": {
         alias: "id",
-        describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var)",
+        describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
         type: "string",
-        default: process.env.CRAWL_ID || os.hostname(),
       },

       "waitUntil": {
         describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
-        default: "load,networkidle2",
+        type: "array",
+        default: ["load", "networkidle2"],
+        choices: WAIT_UNTIL_OPTS,
+        coerce,
       },

       "depth": {
@@ -173,25 +179,36 @@

       "logging": {
         describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug",
-        type: "string",
-        default: "stats",
+        type: "array",
+        default: ["stats"],
+        coerce,
       },

       "logLevel": {
         describe: "Comma-separated list of log levels to include in logs",
-        type: "string",
-        default: "",
+        type: "array",
+        default: [],
+        coerce,
       },

       "context": {
         describe: "Comma-separated list of contexts to include in logs",
-        type: "string",
-        default: "",
+        type: "array",
+        default: [],
+        coerce,
       },

       "text": {
-        describe: "If set, extract text to the pages.jsonl file",
-        type: "string",
+        describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
+        type: "array",
+        choices: EXTRACT_TEXT_TYPES,
+        coerce: (array) => {
+          // backwards compatibility: default --text true / --text -> --text to-pages
+          if (!array.length || (array.length === 1 && array[0] === "true")) {
+            return ["to-pages"];
+          }
+          return coerce(array);
+        }
       },

       "cwd": {
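As a worked illustration of the backwards-compatibility branch in the `text` option's coerce above (hypothetical inputs, and assuming yargs hands the coerce function an empty array for a bare `--text` flag, as the check implies):

```js
const coerce = array => array.flatMap(v => v.split(",")).filter(x => !!x);

const coerceText = (array) => {
  // bare --text (empty array) or --text true still means --text to-pages
  if (!array.length || (array.length === 1 && array[0] === "true")) {
    return ["to-pages"];
  }
  return coerce(array);
};

coerceText([]);                        // ["to-pages"]
coerceText(["true"]);                  // ["to-pages"]
coerceText(["to-warc,final-to-warc"]); // ["to-warc", "final-to-warc"]
```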
@@ -231,8 +248,10 @@

       "behaviors": {
         describe: "Which background behaviors to enable on each page",
-        default: "autoplay,autofetch,autoscroll,siteSpecific",
-        type: "string",
+        type: "array",
+        default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
+        choices: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
+        coerce,
       },

       "behaviorTimeout": {
@@ -261,9 +280,11 @@
       },

       "screenshot": {
-        describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage (comma-separated list)",
-        type: "string",
-        default: "",
+        describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage",
+        type: "array",
+        default: [],
+        choices: Array.from(Object.keys(screenshotTypes)),
+        coerce,
       },

       "screencastPort": {
@@ -441,6 +462,7 @@
   }

   validateArgs(argv) {
+    argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname;
     argv.collection = interpolateFilename(argv.collection, argv.crawlId);

     // Check that the collection name is valid.
@@ -448,42 +470,13 @@
       logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
     }

-    // waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
-    // can be multiple separate by comma
-    // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
-    if (typeof argv.waitUntil != "object") {
-      argv.waitUntil = argv.waitUntil.split(",");
-    }
-
-    // split text options
-    if (argv.text === "" || argv.text === "true") {
-      argv.text = "to-pages";
-    }
-
-    argv.waitUntil = validateArrayOpts(argv.waitUntil, "waitUntil", WAIT_UNTIL_OPTS);
-
-    argv.screenshot = validateArrayOpts(argv.screenshot, "screenshot", Array.from(Object.keys(screenshotTypes)));
-
-    argv.text = validateArrayOpts(argv.text, "text", EXTRACT_TEXT_TYPES);
-
-    // log options
-    argv.logging = argv.logging.split(",");
-    argv.logLevel = argv.logLevel ? argv.logLevel.split(",") : [];
-    argv.context = argv.context ? argv.context.split(",") : [];
-
     // background behaviors to apply
     const behaviorOpts = {};
-    if (typeof argv.behaviors != "object"){
-      argv.behaviors = argv.behaviors.split(",");
-    }
     argv.behaviors.forEach((x) => behaviorOpts[x] = true);
     behaviorOpts.log = BEHAVIOR_LOG_FUNC;
     argv.behaviorOpts = JSON.stringify(behaviorOpts);

-    if (argv.newContext) {
-      logger.info("Note: The newContext argument is deprecated in 0.8.0. Values passed to this option will be ignored");
-    }
-
+    argv.text = argv.text || [];

     if (argv.mobileDevice) {
       argv.emulateDevice = devices[argv.mobileDevice.replace("-", " ")];
@@ -560,30 +553,6 @@
   }
 }

-function validateArrayOpts(value, name, allowedValues) {
-  if (!value) {
-    return [];
-  }
-
-  if (value instanceof Array) {
-    return value;
-  }
-
-  if (typeof(value) !== "string") {
-    return [];
-  }
-
-  const arrayValue = value.split(",");
-
-  for (value of arrayValue) {
-    if (!allowedValues.includes(value)) {
-      logger.fatal(`Invalid value "${value}" for field "${name}": allowed values are: ${allowedValues.join(",")}`);
-    }
-  }
-
-  return arrayValue;
-}
-
 export function parseArgs(argv) {
   return new ArgParser().parseArgs(argv);
 }
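The hand-rolled validateArrayOpts() helper removed above is superseded by declaring `choices` on the options themselves; a minimal sketch of that yargs behavior (hypothetical flag value; the claim that `choices` now handles validation comes from the commit message):

```js
import yargs from "yargs";

// With `choices` on an array option, yargs rejects values outside the
// allowed set on its own, so no hand-rolled validation loop is needed.
yargs(["--screenshot", "bogus"])
  .option("screenshot", {
    type: "array",
    choices: ["view", "thumbnail", "fullPage"],
  })
  .parse(); // fails with an invalid-values error for "bogus"
```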