import path from "path";
import fs from "fs";
import os from "os";
import yaml from "js-yaml";

import { KnownDevices as devices } from "puppeteer-core";
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

import { createParser } from "css-selector-parser";

import {
  BEHAVIOR_LOG_FUNC,
  WAIT_UNTIL_OPTS,
  EXTRACT_TEXT_TYPES,
  SERVICE_WORKER_OPTS,
  DEFAULT_SELECTORS,
  BEHAVIOR_TYPES,
  ExtractSelector,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js";
import {
  DEFAULT_EXCLUDE_LOG_CONTEXTS,
  LOG_CONTEXT_TYPES,
  LogContext,
  logger,
} from "./logger.js";
import { SaveState } from "./state.js";

// ============================================================================
export type CrawlerArgs = ReturnType<typeof parseArgs> & {
  logContext: LogContext[];
  logExcludeContext: LogContext[];
  text: string[];
  scopedSeeds: ScopedSeed[];
  customBehaviors: string[];
  selectLinks: ExtractSelector[];
  crawlId: string;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  origConfig: Record<string, any>;
  state?: SaveState;
  warcInfo?: Record<string, string>;
};

// ============================================================================
class ArgParser {
  initArgs(argv: string[]) {
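    // normalize a multi-value option: each entry may itself be comma-separated,
    // e.g. "--text a,b --text c" and "--text a --text b --text c" both yield ["a", "b", "c"]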
    const coerce = (array: string[]): string[] => {
      return array.flatMap((v) => v.split(",")).filter((x) => !!x);
    };

    return yargs(hideBin(argv))
      .usage("crawler [options]")
      .options({
        seeds: {
          alias: "url",
          describe: "The URL to start crawling from",
          type: "array",
          default: [],
        },
        seedFile: {
          alias: ["urlFile"],
          describe:
            "If set, read a list of seed urls, one per line, from the specified file",
          type: "string",
        },
        workers: {
          alias: "w",
          describe: "The number of workers to run in parallel",
          default: 1,
          type: "number",
        },
        crawlId: {
          alias: "id",
          describe:
            "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
          type: "string",
        },
        waitUntil: {
          describe:
            "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
          type: "array",
          default: ["load", "networkidle2"],
          choices: WAIT_UNTIL_OPTS,
          coerce,
        },
        depth: {
          describe: "The depth of the crawl for all seeds",
          default: -1,
          type: "number",
        },
        extraHops: {
          describe:
            "Number of extra 'hops' to follow, beyond the current scope",
          default: 0,
          type: "number",
        },
        pageLimit: {
          alias: "limit",
          describe: "Limit crawl to this number of pages",
          default: 0,
          type: "number",
        },
        maxPageLimit: {
          describe:
            "Maximum pages to crawl, overriding pageLimit if both are set",
          default: 0,
          type: "number",
        },
        pageLoadTimeout: {
          alias: "timeout",
          describe: "Timeout for each page to load (in seconds)",
          default: 90,
          type: "number",
        },
        scopeType: {
          describe:
            "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
          type: "string",
          choices: [
            "page",
            "page-spa",
            "prefix",
            "host",
            "domain",
            "any",
            "custom",
          ],
        },
        scopeIncludeRx: {
          alias: "include",
          describe:
            "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
          type: "string",
        },
        scopeExcludeRx: {
          alias: "exclude",
          describe:
            "Regex of page URLs that should be excluded from the crawl.",
          type: "string",
        },
        allowHashUrls: {
          describe:
            "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
        },
        selectLinks: {
alias : "linkSelector" ,
2024-11-08 08:04:41 -08:00
describe :
"One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]" ,
type : "array" ,
default : [ "a[href]->href" ] ,
coerce ,
} ,
        clickSelector: {
          describe:
            "Selector for elements to click when using the autoclick behavior",
          type: "string",
          default: "a",
        },
        blockRules: {
          describe:
            "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
          type: "array",
          default: [],
        },
        blockMessage: {
          describe:
            "If specified, when a URL is blocked, a record with this error message is added instead",
          type: "string",
          default: "",
        },
        blockAds: {
          alias: "blockads",
          describe:
            "If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
          type: "boolean",
          default: false,
        },
        adBlockMessage: {
          describe:
            "If specified, when an ad is blocked, a record with this error message is added instead",
          type: "string",
          default: "",
        },
        collection: {
          alias: "c",
          describe:
            "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
          type: "string",
          default: "crawl-@ts",
        },
        headless: {
          describe: "Run in headless mode, otherwise start xvfb",
          type: "boolean",
          default: false,
        },
        driver: {
          describe: "Custom driver for the crawler, if any",
          type: "string",
        },
        generateCDX: {
          alias: ["generatecdx", "generateCdx"],
          describe:
            "If set, generate index (CDXJ) for use with pywb after crawl is done",
          type: "boolean",
          default: false,
        },
        combineWARC: {
          alias: ["combinewarc", "combineWarc"],
          describe: "If set, combine the warcs",
          type: "boolean",
          default: false,
        },
        rolloverSize: {
          describe: "If set, declare the rollover size",
          default: 1000000000,
          type: "number",
        },
        generateWACZ: {
          alias: ["generatewacz", "generateWacz"],
          describe: "If set, generate WACZ on disk",
          type: "boolean",
          default: false,
        },
        logging: {
          describe:
            "Logging options for crawler, can include: stats (enabled by default), jserrors, debug",
          type: "array",
          default: ["stats"],
          coerce,
        },
        logLevel: {
          describe: "Comma-separated list of log levels to include in logs",
          type: "array",
          default: [],
          coerce,
        },
        context: {
          alias: "logContext",
          describe: "Comma-separated list of contexts to include in logs",
          type: "array",
          default: [],
          choices: LOG_CONTEXT_TYPES,
          coerce,
        },
        logExcludeContext: {
          describe: "Comma-separated list of contexts to NOT include in logs",
          type: "array",
          default: DEFAULT_EXCLUDE_LOG_CONTEXTS,
          choices: LOG_CONTEXT_TYPES,
          coerce,
        },
        text: {
          describe:
            "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
          type: "array",
          choices: EXTRACT_TEXT_TYPES,
          coerce: (array) => {
            // backwards compatibility: default --text true / --text -> --text to-pages
            if (!array.length || (array.length === 1 && array[0] === "true")) {
              return ["to-pages"];
            }
            if (array.length === 1 && array[0] === "false") {
              return [];
            }
            return coerce(array);
          },
        },
        cwd: {
          describe:
            "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
          type: "string",
          default: process.cwd(),
        },
        mobileDevice: {
          describe:
            "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
          type: "string",
        },
        userAgent: {
          describe: "Override user-agent with specified string",
          type: "string",
        },
        userAgentSuffix: {
          describe:
            "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
          type: "string",
        },
        useSitemap: {
          alias: "sitemap",
          describe:
            "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
        },
        sitemapFromDate: {
          alias: "sitemapFrom",
          describe:
            "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
          type: "string",
        },
        sitemapToDate: {
          alias: "sitemapTo",
          describe:
            "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
          type: "string",
        },
        statsFilename: {
          type: "string",
          describe:
            "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)",
        },
        behaviors: {
          describe: "Which background behaviors to enable on each page",
          type: "array",
          default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
          coerce,
        },
        behaviorTimeout: {
          describe:
            "If >0, timeout (in seconds) for in-page behaviors to run on each page. If 0, a behavior can run until it finishes.",
          default: 90,
          type: "number",
        },
        postLoadDelay: {
          describe:
            "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors",
          default: 0,
          type: "number",
        },
        pageExtraDelay: {
          alias: "delay",
          describe:
            "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
          default: 0,
          type: "number",
        },
        dedupPolicy: {
          describe: "Deduplication policy",
          default: "skip",
          type: "string",
          choices: ["skip", "revisit", "keep"],
        },
        profile: {
          describe:
            "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory",
          type: "string",
        },
        screenshot: {
          describe:
            "Screenshot options for crawler, can include: view, thumbnail, fullPage, fullPageFinal",
          type: "array",
          default: [],
          choices: Array.from(Object.keys(screenshotTypes)),
          coerce,
        },
        screencastPort: {
          describe:
            "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
          type: "number",
          default: 0,
        },
        screencastRedis: {
          describe:
            "If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
          type: "boolean",
          default: false,
        },
        warcInfo: {
          alias: ["warcinfo"],
          describe:
            "Optional fields added to the warcinfo record in combined WARCs",
          //type: "object"
        },
        redisStoreUrl: {
          describe:
            "If set, url for remote redis server to store state. Otherwise, using local redis instance",
          type: "string",
          default: "redis://localhost:6379/0",
        },
        saveState: {
          describe:
            "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
          type: "string",
          default: "partial",
          choices: ["never", "partial", "always"],
        },
        saveStateInterval: {
          describe:
            "If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
          type: "number",
          default: 300,
        },
        saveStateHistory: {
          describe:
            "Number of save states to keep during the duration of a crawl",
          type: "number",
          default: 5,
        },
        sizeLimit: {
          describe:
            "If set, save state and exit if size limit exceeds this value",
          type: "number",
          default: 0,
        },
        diskUtilization: {
          describe:
            "If set, save state and exit if disk utilization exceeds this percentage value",
          type: "number",
          default: 90,
        },
        timeLimit: {
          describe: "If set, save state and exit after time limit, in seconds",
          type: "number",
          default: 0,
        },
        healthCheckPort: {
          describe: "port to run healthcheck on",
          type: "number",
          default: 0,
        },
        overwrite: {
          describe:
            "overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started",
          type: "boolean",
          default: false,
        },
        waitOnDone: {
          describe:
            "if set, wait for interrupt signal when finished instead of exiting",
          type: "boolean",
          default: false,
        },
        restartsOnError: {
          describe:
            "if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt",
          type: "boolean",
          default: false,
        },
        netIdleWait: {
          describe:
            "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
          type: "number",
          default: -1,
        },
        lang: {
          describe:
            "if set, sets the language used by the browser, should be ISO 639 language[-country] code",
          type: "string",
        },
        title: {
          describe:
            "If set, write supplied title into WACZ datapackage.json metadata",
          type: "string",
        },
        description: {
          alias: ["desc"],
          describe:
            "If set, write supplied description into WACZ datapackage.json metadata",
          type: "string",
        },
        originOverride: {
          describe:
            "if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port",
          type: "array",
          default: [],
        },
        logErrorsToRedis: {
          describe: "If set, write error messages to redis",
          type: "boolean",
          default: false,
        },
        writePagesToRedis: {
          describe: "If set, write page objects to redis",
          type: "boolean",
          default: false,
        },
        failOnFailedSeed: {
          describe:
            "If set, crawler will fail with exit code 1 if any seed fails. When combined with --failOnInvalidStatus," +
            " will result in crawl failing with exit code 1 if any seed has a 4xx/5xx response",
          type: "boolean",
          default: false,
        },
        failOnFailedLimit: {
          describe:
            "If set, save state and exit if number of failed pages exceeds this value",
          type: "number",
          default: 0,
        },
        failOnInvalidStatus: {
          describe:
            "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit" +
            " or --failOnFailedSeed, may result in crawl failing due to non-200 responses",
          type: "boolean",
          default: false,
        },
        customBehaviors: {
          describe:
            "Custom behavior files to inject. Valid values: URL to file, path to file, path to directory" +
            " of behaviors, URL to Git repo of behaviors (prefixed with git+, optionally specify branch and" +
            " relative path to a directory within repo as branch and path query parameters, e.g." +
            ' --customBehaviors "git+https://git.example.com/repo.git?branch=dev&path=some/dir")',
          type: "array",
          default: [],
        },
        debugAccessRedis: {
          describe:
            "if set, runs internal redis without protected mode to allow external access (for debugging)",
          type: "boolean",
        },
        debugAccessBrowser: {
          describe: "if set, allow debugging browser on port 9222 via CDP",
          type: "boolean",
        },
        warcPrefix: {
          describe:
            "prefix for WARC files generated, including WARCs added to WACZ",
          type: "string",
        },
        serviceWorker: {
          alias: "sw",
          describe:
            "service worker handling: disabled, enabled, or disabled with custom profile",
          choices: SERVICE_WORKER_OPTS,
          default: "disabled",
        },
        proxyServer: {
          describe:
            "if set, will use specified proxy server. Takes precedence over any env var proxy settings",
          type: "string",
        },
        dryRun: {
          describe:
            "If true, no archive data is written to disk, only pages and logs (and optionally saved state).",
          type: "boolean",
        },
        qaSource: {
          describe: "Required for QA mode. Source (WACZ or multi WACZ) for QA",
          type: "string",
        },
        qaDebugImageDiff: {
          describe:
            "if specified, will write crawl.png, replay.png and diff.png for each page where they're different",
          type: "boolean",
        },
        sshProxyPrivateKeyFile: {
          describe:
            "path to SSH private key for SOCKS5 over SSH proxy connection",
          type: "string",
        },
        sshProxyKnownHostsFile: {
          describe:
            "path to SSH known hosts file for SOCKS5 over SSH proxy connection",
          type: "string",
        },
      });
  }

  parseArgs(argvParams?: string[], isQA = false) {
    let argv = argvParams || process.argv;
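
    // additional command-line args may be appended via env vars:
    // QA_ARGS when running in QA mode, CRAWL_ARGS otherwise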
    const envArgs =
      isQA && process.env.QA_ARGS
        ? process.env.QA_ARGS
        : process.env.CRAWL_ARGS;

    if (envArgs) {
      argv = argv.concat(this.splitCrawlArgsQuoteSafe(envArgs));
    }

    let origConfig = {};

    const parsed = this.initArgs(argv)
      .config(
        "config",
        "Path to YAML config file",
        (configPath: string | number) => {
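          // support reading the YAML config from stdin:
          // "/crawls/stdin" is the special path used when --config is set to 'stdin'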
          if (configPath === "/crawls/stdin") {
            configPath = process.stdin.fd;
          }
          // eslint-disable-next-line @typescript-eslint/no-explicit-any
          origConfig = yaml.load(fs.readFileSync(configPath, "utf8")) as any;
          return origConfig;
        },
      )
      .check((argv) => this.validateArgs(argv, isQA))
      .parseSync();

    parsed.origConfig = origConfig;

    return parsed;
  }

  splitCrawlArgsQuoteSafe(crawlArgs: string): string[] {
    // Split process.env.CRAWL_ARGS on spaces but retain spaces within double quotes
    const regex = /"[^"]+"|[^\s]+/g;
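    // e.g. '--title "My Crawl" --workers 2' yields ["--title", "My Crawl", "--workers", "2"]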
    const res = crawlArgs.match(regex);
    return res ? res.map((e) => e.replace(/"(.+)"/, "$1")) : [];
  }

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  validateArgs(argv: any, isQA: boolean) {
    argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
    argv.collection = interpolateFilename(argv.collection, argv.crawlId);

    // Check that the collection name is valid.
    if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
      logger.fatal(
        `\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`,
      );
    }

    // background behaviors to apply
    const behaviorOpts: { [key: string]: string | boolean } = {};

    if (argv.behaviors.length > 0) {
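      // unknown behavior names only log a warning rather than a hard exit,
      // for forward compatibility with newer behavior types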
      argv.behaviors.forEach((x: string) => {
        if (BEHAVIOR_TYPES.includes(x)) {
          behaviorOpts[x] = true;
        } else {
          logger.warn(
            "Unknown behavior specified, ignoring",
            { behavior: x },
            "behavior",
          );
        }
      });

      behaviorOpts.log = BEHAVIOR_LOG_FUNC;
      behaviorOpts.startEarly = true;
      behaviorOpts.clickSelector = argv.clickSelector;

      argv.behaviorOpts = JSON.stringify(behaviorOpts);
    } else {
      argv.behaviorOpts = "";
    }

    argv.text = argv.text || [];

    if (argv.mobileDevice) {
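      // device names may use '-' in place of spaces on the command line,
      // e.g. "iPhone-13" should match the "iPhone 13" entry in KnownDevices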
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      argv.emulateDevice = (devices as Record<string, any>)[
        argv.mobileDevice.replace("-", " ")
      ];

      if (!argv.emulateDevice) {
        logger.fatal("Unknown device: " + argv.mobileDevice);
      }
    } else {
      argv.emulateDevice = { viewport: null };
    }

    if (argv.seedFile) {
      const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
      const urlSeedFileList = urlSeedFile.split("\n");

      if (typeof argv.seeds === "string") {
        argv.seeds = [argv.seeds];
      }

      for (const seed of urlSeedFileList) {
        if (seed) {
          // eslint-disable-next-line @typescript-eslint/no-explicit-any
          (argv.seeds as any).push(seed);
        }
      }
    }

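    // each link selector is "[css selector]->[property]" or "[css selector]->@[attribute]",
    // e.g. "a[href]->href" reads the href property, while "img->@src" reads the src attribute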
    let selectLinks: ExtractSelector[];

    const parser = createParser();

    if (argv.selectLinks) {
      selectLinks = argv.selectLinks.map((x: string) => {
        const parts = x.split("->");
        const selector = parts[0];
        const value = parts[1] || "";
        const extract = parts.length > 1 ? value.replace("@", "") : "href";
        const isAttribute = value.startsWith("@");
        try {
          parser(selector);
        } catch (e) {
          logger.fatal("Invalid Link Extraction CSS Selector", { selector });
        }
        return { selector, extract, isAttribute };
      });
    } else {
      selectLinks = DEFAULT_SELECTORS;
    }

    argv.selectLinks = selectLinks;

    if (argv.netIdleWait === -1) {
      if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
        argv.netIdleWait = 15;
      } else {
        argv.netIdleWait = 2;
      }
      //logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
    }

    const scopedSeeds: ScopedSeed[] = [];

    if (!isQA) {
      const scopeOpts = {
        scopeType: argv.scopeType,
        sitemap: argv.sitemap,
        include: argv.include,
        exclude: argv.exclude,
        depth: argv.depth,
        extraHops: argv.extraHops,
      };

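      // a seed may be a bare URL string or an object with per-seed overrides
      // (include/exclude, depth, sitemap, etc.) merged over the global scope options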
      for (const seed of argv.seeds) {
        const newSeed = typeof seed === "string" ? { url: seed } : seed;

        try {
          scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
          // eslint-disable-next-line @typescript-eslint/no-explicit-any
        } catch (e: any) {
          logger.error("Failed to create seed", {
            error: e.toString(),
            ...scopeOpts,
            ...newSeed,
          });
          if (argv.failOnFailedSeed) {
            logger.fatal(
              "Invalid seed specified, aborting crawl",
              { url: newSeed.url },
              "general",
              1,
            );
          }
        }
      }

      if (!scopedSeeds.length) {
        logger.fatal("No valid seeds specified, aborting crawl");
      }
    } else if (!argv.qaSource) {
      logger.fatal("--qaSource required for QA mode");
    }

    argv.scopedSeeds = scopedSeeds;

    // Resolve statsFilename
    if (argv.statsFilename) {
      argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
    }

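    // reset an out-of-range diskUtilization (valid: 0-99) back to the 90% default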
    if (argv.diskUtilization < 0 || argv.diskUtilization > 99) {
      argv.diskUtilization = 90;
    }

    return true;
  }
}

export function parseArgs(argv?: string[], isQA = false) {
  return new ArgParser().parseArgs(argv, isQA);
}