import path from "path";
import fs from "fs";
import os from "os";
import yaml from "js-yaml";
import { KnownDevices as devices } from "puppeteer-core";
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js";
import { logger } from "./logger.js";
// ============================================================================
class ArgParser {
  get cliOpts() {
    return {
      "seeds": {
alias : "url" ,
2021-06-23 19:36:32 -07:00
describe : "The URL to start crawling from" ,
type : "array" ,
        default: [],
      },
"seedFile" : {
alias : [ "urlFile" ] ,
describe : "If set, read a list of seed urls, one per line, from the specified" ,
type : "string" ,
} ,
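      // Example (hypothetical invocation and paths): seeds given via
      // --url/--seed and those read from --seedFile/--urlFile are combined
      // into a single seed list:
      //   crawl --url https://example.com/ --seedFile /crawls/seeds.txt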
      "workers": {
        alias: "w",
        describe: "The number of workers to run in parallel",
        default: 1,
        type: "number",
      },
"crawlId" : {
alias : "id" ,
describe : "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var)" ,
type : "string" ,
default : process . env . CRAWL _ID || os . hostname ( ) ,
} ,
2021-06-23 19:36:32 -07:00
"newContext" : {
2022-12-21 12:06:13 -05:00
describe : "Deprecated as of 0.8.0, any values passed will be ignored" ,
default : null ,
2021-06-23 19:36:32 -07:00
type : "string"
} ,
"waitUntil" : {
2023-04-26 15:41:35 -07:00
describe : "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','" ,
default : "load,networkidle2" ,
2021-06-23 19:36:32 -07:00
} ,
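      // Example (illustrative): wait only on the 'load' event for pages where
      // network-idle detection is too slow:
      //   crawl --url https://example.com/ --waitUntil load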
"depth" : {
describe : "The depth of the crawl for all seeds" ,
default : - 1 ,
type : "number" ,
} ,
2022-01-15 09:03:09 -08:00
"extraHops" : {
describe : "Number of extra 'hops' to follow, beyond the current scope" ,
default : 0 ,
type : "number"
} ,
      "pageLimit": {
        alias: "limit",
        describe: "Limit crawl to this number of pages",
        default: 0,
        type: "number",
      },
      "maxPageLimit": {
        describe: "Maximum pages to crawl, overriding pageLimit if both are set",
        default: 0,
        type: "number",
      },
      "pageLoadTimeout": {
        alias: "timeout",
        describe: "Timeout for each page to load (in seconds)",
        default: 90,
        type: "number",
      },
      "scopeType": {
        describe: "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
        type: "string",
        choices: ["page", "page-spa", "prefix", "host", "domain", "any", "custom"]
      },
      "scopeIncludeRx": {
        alias: "include",
        describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
      },
      "scopeExcludeRx": {
        alias: "exclude",
        describe: "Regex of page URLs that should be excluded from the crawl."
      },
      "allowHashUrls": {
        describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
      },
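      // Example (hypothetical URL and regex): stay within one directory while
      // skipping print views:
      //   crawl --url https://example.com/docs/ --scopeType prefix --exclude "print=1"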
      "blockRules": {
        describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
        type: "array",
        default: [],
      },
      "blockMessage": {
        describe: "If specified, when a URL is blocked, a record with this error message is added instead",
        type: "string",
      },
      "blockAds": {
        alias: "blockads",
        describe: "If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
        type: "boolean",
        default: false,
      },
      "adBlockMessage": {
        describe: "If specified, when an ad is blocked, a record with this error message is added instead",
        type: "string",
      },
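      // Example (hypothetical regex): block a tracking endpoint by URL regex
      // and record why in its place:
      //   crawl --url https://example.com/ --blockRules "analytics\.example" --blockMessage "tracker blocked"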
      "collection": {
        alias: "c",
        describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
        type: "string",
default : "crawl-@ts"
2021-06-23 19:36:32 -07:00
} ,
"headless" : {
describe : "Run in headless mode, otherwise start xvfb" ,
type : "boolean" ,
default : false ,
} ,
"driver" : {
describe : "JS driver for the crawler" ,
type : "string" ,
2022-10-24 15:30:10 +02:00
default : "./defaultDriver.js" ,
2021-06-23 19:36:32 -07:00
} ,
"generateCDX" : {
alias : [ "generatecdx" , "generateCdx" ] ,
describe : "If set, generate index (CDXJ) for use with pywb after crawl is done" ,
type : "boolean" ,
default : false ,
} ,
"combineWARC" : {
alias : [ "combinewarc" , "combineWarc" ] ,
describe : "If set, combine the warcs" ,
type : "boolean" ,
default : false ,
} ,
2021-07-07 18:56:52 -04:00
2021-06-23 19:36:32 -07:00
"rolloverSize" : {
describe : "If set, declare the rollover size" ,
default : 1000000000 ,
type : "number" ,
} ,
      "generateWACZ": {
        alias: ["generatewacz", "generateWacz"],
        describe: "If set, generate wacz",
        type: "boolean",
        default: false,
      },
      "logging": {
        describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug",
        type: "string",
        default: "stats",
      },
      "logLevel": {
        describe: "Comma-separated list of log levels to include in logs",
        type: "string",
        default: "",
      },
      "context": {
        describe: "Comma-separated list of contexts to include in logs",
        type: "string",
        default: "",
      },
      "text": {
        describe: "If set, extract text to the pages.jsonl file",
        type: "boolean",
        default: false,
      },
      "cwd": {
        describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
        type: "string",
        default: process.cwd(),
      },
"mobileDevice" : {
2023-04-26 15:41:35 -07:00
describe : "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts" ,
2021-06-23 19:36:32 -07:00
type : "string" ,
} ,
"userAgent" : {
describe : "Override user-agent with specified string" ,
type : "string" ,
} ,
"userAgentSuffix" : {
describe : "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)" ,
type : "string" ,
} ,
"useSitemap" : {
2021-07-06 20:22:27 -07:00
alias : "sitemap" ,
2021-06-23 19:36:32 -07:00
describe : "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified" ,
} ,
"statsFilename" : {
describe : "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
} ,
"behaviors" : {
describe : "Which background behaviors to enable on each page" ,
2022-09-08 23:39:26 -07:00
default : "autoplay,autofetch,autoscroll,siteSpecific" ,
2021-06-23 19:36:32 -07:00
type : "string" ,
} ,
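      // Example (illustrative): run only the scrolling and media behaviors,
      // skipping site-specific ones:
      //   crawl --url https://example.com/ --behaviors autoscroll,autoplay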
"behaviorTimeout" : {
describe : "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish." ,
default : 90 ,
type : "number" ,
} ,
2023-03-22 14:50:18 -04:00
"pageExtraDelay" : {
alias : "delay" ,
describe : "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page" ,
default : 0 ,
type : "number" ,
} ,
      "dedupPolicy": {
        describe: "Deduplication policy",
        default: "skip",
        type: "string",
        choices: ["skip", "revisit", "keep"],
      },
      "profile": {
        describe: "Path to tar.gz file which will be extracted and used as the browser profile",
        type: "string",
      },
      "screenshot": {
        describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage (comma-separated list)",
        type: "string",
        default: "",
      },
      "screencastPort": {
        describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
        type: "number",
        default: 0
      },
      "screencastRedis": {
        describe: "If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
        type: "boolean",
        default: false
      },
      "warcInfo": {
        alias: ["warcinfo"],
        describe: "Optional fields added to the warcinfo record in combined WARCs",
        type: "object"
      },
      "redisStoreUrl": {
        describe: "If set, url for remote redis server to store state. Otherwise, using in-memory store",
type : "string" ,
default : "redis://localhost:6379/0"
      },
      "saveState": {
        describe: "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
        type: "string",
        default: "partial",
        choices: ["never", "partial", "always"]
      },
      "saveStateInterval": {
        describe: "If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
        type: "number",
        default: 300,
      },
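      // Example (illustrative): snapshot crawl state every five minutes:
      //   crawl --url https://example.com/ --saveState always --saveStateInterval 300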
"saveStateHistory" : {
describe : "Number of save states to keep during the duration of a crawl" ,
type : "number" ,
default : 5 ,
      },
      "sizeLimit": {
        describe: "If set, save state and exit if size limit exceeds this value",
        type: "number",
        default: 0,
      },
      "diskUtilization": {
        describe: "If set, save state and exit if disk utilization exceeds this percentage value",
        type: "number",
        default: 90,
      },
"timeLimit" : {
describe : "If set, save state and exit after time limit, in seconds" ,
type : "number" ,
default : 0 ,
} ,
"healthCheckPort" : {
describe : "port to run healthcheck on" ,
type : "number" ,
default : 0 ,
} ,
"overwrite" : {
describe : "overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started" ,
type : "boolean" ,
default : false
} ,
"waitOnDone" : {
describe : "if set, wait for interrupt signal when finished instead of exiting" ,
type : "boolean" ,
default : false
} ,
2022-07-08 17:17:46 -07:00
"netIdleWait" : {
2022-09-08 23:39:26 -07:00
describe : "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope" ,
2022-07-08 17:17:46 -07:00
type : "number" ,
2022-09-08 23:39:26 -07:00
default : - 1
2022-11-21 11:59:37 -08:00
} ,
"lang" : {
describe : "if set, sets the language used by the browser, should be ISO 639 language[-country] code" ,
type : "string"
2023-04-04 10:46:03 -04:00
} ,
"title" : {
describe : "If set, write supplied title into WACZ datapackage.json metadata" ,
type : "string"
} ,
"description" : {
alias : [ "desc" ] ,
describe : "If set, write supplied description into WACZ datapackage.json metadata" ,
type : "string"
2023-04-11 11:32:52 -04:00
} ,
      "originOverride": {
        describe: "if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port",
        type: "array",
        default: [],
      },
      "logErrorsToRedis": {
        describe: "If set, write error messages to redis",
        type: "boolean",
        default: false,
      },
      "failOnFailedSeed": {
        describe: "If set, crawler will fail with exit code 1 if any seed fails",
        type: "boolean",
        default: false
      },
      "customBehaviors": {
        describe: "injects a custom behavior file or set of behavior files in a directory",
        type: ["string"]
      },
    };
  }

  parseArgs(argv) {
    argv = argv || process.argv;
    if (process.env.CRAWL_ARGS) {
      argv = argv.concat(splitArgsQuoteSafe(process.env.CRAWL_ARGS));
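      // Example (hypothetical flags): extra args can be supplied via env var,
      // e.g. CRAWL_ARGS="--generateWACZ --workers 4"; splitArgsQuoteSafe()
      // splits on spaces while keeping quoted values intact.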
    }

    let origConfig = {};

    const parsed = yargs(hideBin(argv))
      .usage("crawler [options]")
      .option(this.cliOpts)
. config ( "config" , "Path to YAML config file" , ( configPath ) => {
if ( configPath === "/crawls/stdin" ) {
configPath = process . stdin . fd ;
}
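        // Example (assuming the default /crawls working directory): pass the
        // YAML config on stdin by setting --config to 'stdin':
        //   cat crawl-config.yaml | crawl --config stdin
        // yargs resolves the relative path to /crawls/stdin, which is remapped
        // above to the stdin file descriptor.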
        origConfig = yaml.load(fs.readFileSync(configPath, "utf8"));
        return origConfig;
      })
      .check((argv) => this.validateArgs(argv))
      .argv;
    return { parsed, origConfig };
  }

  validateArgs(argv) {
    argv.collection = interpolateFilename(argv.collection, argv.crawlId);

    // Check that the collection name is valid.
    if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
      logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
    }
    // waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
    // can be multiple separated by comma
    // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
    if (typeof argv.waitUntil != "object") {
      argv.waitUntil = argv.waitUntil.split(",");
    }
    for (const opt of argv.waitUntil) {
      if (!WAIT_UNTIL_OPTS.includes(opt)) {
        logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
      }
    }
    // validate screenshot options
    if (argv.screenshot) {
      const passedScreenshotTypes = argv.screenshot.split(",");
      argv.screenshot = [];
      passedScreenshotTypes.forEach((element) => {
        if (element in screenshotTypes) {
          argv.screenshot.push(element);
        } else {
          logger.warn(`${element} not found in screenshot types: ${Object.keys(screenshotTypes).join(",")}`);
        }
      });
    }
    // log options
    argv.logging = argv.logging.split(",");
    argv.logLevel = argv.logLevel ? argv.logLevel.split(",") : [];
    argv.context = argv.context ? argv.context.split(",") : [];
    // background behaviors to apply
    const behaviorOpts = {};
    if (typeof argv.behaviors != "object") {
      argv.behaviors = argv.behaviors.split(",");
    }
    argv.behaviors.forEach((x) => behaviorOpts[x] = true);
    behaviorOpts.log = BEHAVIOR_LOG_FUNC;
    argv.behaviorOpts = JSON.stringify(behaviorOpts);

    if (argv.newContext) {
      logger.info("Note: The newContext argument is deprecated in 0.8.0. Values passed to this option will be ignored");
    }
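    // Example (illustrative): --behaviors autoscroll,autoplay yields
    // behaviorOpts = JSON.stringify({ autoscroll: true, autoplay: true, log: BEHAVIOR_LOG_FUNC })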
    if (argv.mobileDevice) {
      argv.emulateDevice = devices[argv.mobileDevice.replace("-", " ")];
      if (!argv.emulateDevice) {
        logger.fatal("Unknown device: " + argv.mobileDevice);
      }
    } else {
      argv.emulateDevice = { viewport: null };
    }
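
    // Illustrative sketch (not part of the original flow): hyphenated
    // --mobileDevice names are mapped onto puppeteer-core KnownDevices keys,
    // which use spaces. The device name below is an assumed example:
    //   "iPhone-13-Pro".replace(/-/g, " ")  -> "iPhone 13 Pro"
    //   devices["iPhone 13 Pro"]            -> { userAgent, viewport, ... }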

    if (argv.seedFile) {
      const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
      const urlSeedFileList = urlSeedFile.split("\n");

      if (typeof argv.seeds === "string") {
        argv.seeds = [argv.seeds];
      }

      for (const seed of urlSeedFileList) {
        if (seed) {
          argv.seeds.push(seed);
        }
      }
    }
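
    // For illustration: a seed file is a plain newline-delimited URL list
    // (file name and URLs below are hypothetical); blank lines are skipped
    // by the `if (seed)` check above:
    //   seeds.txt:
    //     https://example.com/
    //     https://example.org/docs/
    //   --seedFile seeds.txt  -> both URLs appended to argv.seeds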

    if (argv.netIdleWait === -1) {
      if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
        argv.netIdleWait = 15;
      } else {
        argv.netIdleWait = 2;
      }
      //logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
    }
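
    // Sketch of the resulting defaults (flag values hypothetical): -1 appears
    // to mark "not set", so only unset values are filled in, and single-page
    // scopes get a longer network-idle wait:
    //   --scopeType page   -> netIdleWait defaults to 15 (seconds)
    //   --scopeType prefix -> netIdleWait defaults to 2
    //   --netIdleWait 30   -> explicit value, left as-is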

    const scopeOpts = {
      scopeType: argv.scopeType,
      sitemap: argv.sitemap,
      include: argv.include,
      exclude: argv.exclude,
      depth: argv.depth,
      extraHops: argv.extraHops,
    };

    argv.scopedSeeds = [];

    for (let seed of argv.seeds) {
      if (typeof seed === "string") {
        seed = { url: seed };
      }
      argv.scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...seed }));
    }
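
    // Merge-semantics sketch (values hypothetical): the object spread lets
    // any per-seed option override the global scopeOpts entry of the same
    // name, while unset options inherit the global value:
    //   scopeOpts = { scopeType: "prefix", depth: -1 }
    //   seed      = { url: "https://example.com/", depth: 2 }
    //   new ScopedSeed({ ...scopeOpts, ...seed })
    //     -> scopeType "prefix" (global), depth 2 (per-seed wins)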

    // Resolve statsFilename relative to the working directory
    if (argv.statsFilename) {
      argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
    }

    if (argv.diskUtilization < 0 || argv.diskUtilization > 99) {
      argv.diskUtilization = 90;
    }
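
    // Sketch of the clamp above (flag values hypothetical): any
    // --diskUtilization outside the 0-99 range falls back to the 90% default:
    //   --diskUtilization 95  -> kept
    //   --diskUtilization 200 -> reset to 90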

    return true;
  }
}

0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00

export function parseArgs(argv) {
  return new ArgParser().parseArgs(argv);
}

export function splitArgsQuoteSafe(crawlArgs) {
  // Split process.env.CRAWL_ARGS on spaces, while retaining spaces within double quotes
  const regex = /"[^"]+"|[^\s]+/g;
  return crawlArgs.match(regex).map(e => e.replace(/"(.+)"/, "$1"));
}
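
// Usage sketch (input string hypothetical): quoted segments survive as a
// single argument, then the surrounding quotes are stripped:
//   splitArgsQuoteSafe('--url "https://example.com/my page" --text')
//     -> ["--url", "https://example.com/my page", "--text"]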