2021-06-23 19:36:32 -07:00
const path = require ( "path" ) ;
const fs = require ( "fs" ) ;
const child _process = require ( "child_process" ) ;
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
const yaml = require ( "js-yaml" ) ;
const puppeteer = require ( "puppeteer-core" ) ;
const { Cluster } = require ( "puppeteer-cluster" ) ;
2021-06-23 19:36:32 -07:00
const yargs = require ( "yargs/yargs" ) ;
const { hideBin } = require ( "yargs/helpers" ) ;
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
const { NewWindowPage } = require ( "./screencaster" ) ;
const { BEHAVIOR _LOG _FUNC , WAIT _UNTIL _OPTS } = require ( "./constants" ) ;
const { ScopedSeed } = require ( "./seeds" ) ;
2021-06-23 19:36:32 -07:00
// ============================================================================
/**
 * Parses and validates crawler options supplied on the command line and/or
 * through a YAML config file.
 *
 * Validation also normalizes the parsed values in place (for example,
 * comma-separated strings become arrays and the timeout is converted to
 * milliseconds), so the object returned by `parseArgs()` is ready to use.
 */
class ArgParser {
  /**
   * @param {string} profileDir - Directory into which a `--profile` tarball
   *   is extracted during validation.
   */
  constructor(profileDir) {
    this.profileDir = profileDir;
  }

  /**
   * Option definitions handed to yargs via `.option()`.
   *
   * Built on every access, so the array defaults and the timestamp-based
   * default collection name are fresh for each parse.
   *
   * @returns {Object} Map of option name to yargs option spec.
   */
  get cliOpts() {
    return {
      "seeds": {
        alias: "url",
        describe: "The URL to start crawling from",
        type: "array",
        default: [],
      },

      "seedFile": {
        alias: ["urlFile"],
        describe: "If set, read a list of seed urls, one per line, from the specified",
        type: "string",
      },

      "workers": {
        alias: "w",
        describe: "The number of workers to run in parallel",
        default: 1,
        type: "number",
      },

      "newContext": {
        describe: "The context for each new capture, can be a new: page, window, session or browser.",
        default: "page",
        type: "string"
      },

      "waitUntil": {
        describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','",
        default: "load,networkidle0",
      },

      "depth": {
        describe: "The depth of the crawl for all seeds",
        default: -1,
        type: "number",
      },

      "limit": {
        describe: "Limit crawl to this number of pages",
        default: 0,
        type: "number",
      },

      "timeout": {
        describe: "Timeout for each page to load (in seconds)",
        default: 90,
        type: "number",
      },

      "scopeType": {
        describe: "Predefined for which URLs to crawl, can be: prefix, page, host, any, or custom, to use the scopeIncludeRx/scopeExcludeRx",
        type: "string",
      },

      "scopeIncludeRx": {
        alias: "include",
        describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
      },

      "scopeExcludeRx": {
        alias: "exclude",
        describe: "Regex of page URLs that should be excluded from the crawl."
      },

      "allowHashUrls": {
        describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
      },

      "blockRules": {
        describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
        type: "array",
        default: [],
      },

      "blockMessage": {
        describe: "If specified, when a URL is blocked, a record with this error message is added instead",
        type: "string",
      },

      "collection": {
        alias: "c",
        describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
        type: "string",
        // ISO timestamp truncated to seconds; ':' is replaced because it is
        // rejected by the collection-name check in validateArgs().
        default: `capture-${new Date().toISOString().slice(0, 19)}`.replace(/:/g, "-")
      },

      "headless": {
        describe: "Run in headless mode, otherwise start xvfb",
        type: "boolean",
        default: false,
      },

      "driver": {
        describe: "JS driver for the crawler",
        type: "string",
        default: path.join(__dirname, "..", "defaultDriver.js"),
      },

      "generateCDX": {
        alias: ["generatecdx", "generateCdx"],
        describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
        type: "boolean",
        default: false,
      },

      "combineWARC": {
        alias: ["combinewarc", "combineWarc"],
        describe: "If set, combine the warcs",
        type: "boolean",
        default: false,
      },

      "rolloverSize": {
        describe: "If set, declare the rollover size",
        default: 1000000000,
        type: "number",
      },

      "generateWACZ": {
        alias: ["generatewacz", "generateWacz"],
        describe: "If set, generate wacz",
        type: "boolean",
        default: false,
      },

      "logging": {
        describe: "Logging options for crawler, can include: stats, pywb, behaviors, behaviors-debug",
        type: "string",
        default: "stats",
      },

      "text": {
        describe: "If set, extract text to the pages.jsonl file",
        type: "boolean",
        default: false,
      },

      "cwd": {
        describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
        type: "string",
        default: process.cwd(),
      },

      "mobileDevice": {
        describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
        type: "string",
      },

      "userAgent": {
        describe: "Override user-agent with specified string",
        type: "string",
      },

      "userAgentSuffix": {
        describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
        type: "string",
      },

      "useSitemap": {
        alias: "sitemap",
        describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
      },

      "statsFilename": {
        describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
      },

      "behaviors": {
        describe: "Which background behaviors to enable on each page",
        default: "autoplay,autofetch,siteSpecific",
        type: "string",
      },

      "profile": {
        describe: "Path to tar.gz file which will be extracted and used as the browser profile",
        type: "string",
      },

      "screencastPort": {
        describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
        type: "number",
        default: 0
      },

      "warcInfo": {
        alias: ["warcinfo"],
        describe: "Optional fields added to the warcinfo record in combined WARCs",
        type: "object"
      }
    };
  }

  /**
   * Parse an argument vector with yargs, merging in a YAML config file when
   * `--config` is given, and run `validateArgs()` on the result.
   *
   * @param {string[]} [argv] - Full argument vector (including the leading
   *   executable/script entries stripped by `hideBin`); defaults to
   *   `process.argv`.
   * @returns {Object} The parsed and normalized arguments.
   */
  parseArgs(argv) {
    argv = argv || process.argv;

    return yargs(hideBin(argv))
      .usage("crawler [options]")
      .option(this.cliOpts)
      .config("config", "Path to YAML config file", (configPath) => {
        // "--config stdin" reaches this callback as an absolute path, so the
        // sentinel is matched against "stdin" resolved from the current
        // working directory as well as the original hard-coded "/crawls" form.
        const isStdin = configPath === "/crawls/stdin" ||
          configPath === path.resolve("stdin");

        if (isStdin) {
          configPath = process.stdin.fd;
        }

        // NOTE(review): relies on js-yaml's load() being the safe loader
        // (true for js-yaml >= 4) -- confirm the installed version.
        return yaml.load(fs.readFileSync(configPath, "utf8"));
      })
      .check((argv) => this.validateArgs(argv))
      .argv;
  }

  /**
   * Validate parsed arguments and normalize them in place.
   *
   * Mutates `argv`: `timeout` becomes milliseconds; `waitUntil`, `logging`
   * and `behaviors` become arrays; `behaviorOpts` (JSON string),
   * `behaviorsLogDebug`, `emulateDevice` and `scopedSeeds` are added;
   * `newContext` is replaced by the matching concurrency implementation;
   * `statsFilename` is resolved against `cwd`. If `profile` is set, the
   * tarball is extracted into `this.profileDir`.
   *
   * @param {Object} argv - Arguments as produced by yargs.
   * @returns {boolean} Always `true` when validation succeeds.
   * @throws {Error} On an invalid collection name, waitUntil option,
   *   newContext value or unknown mobile device.
   */
  validateArgs(argv) {
    // Check that the collection name is valid.
    if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
      throw new Error(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
    }

    // Option is given in seconds; the rest of the crawler works in ms.
    argv.timeout *= 1000;

    // waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
    // can be multiple separate by comma
    // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
    if (typeof argv.waitUntil !== "object") {
      argv.waitUntil = argv.waitUntil.split(",");
    }

    for (const opt of argv.waitUntil) {
      if (!WAIT_UNTIL_OPTS.includes(opt)) {
        throw new Error("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
      }
    }

    // log options: accept either a comma-separated string or an
    // already-parsed list, matching waitUntil/behaviors.
    if (typeof argv.logging !== "object") {
      argv.logging = argv.logging.split(",");
    }

    // background behaviors to apply
    const behaviorOpts = {};

    if (typeof argv.behaviors !== "object") {
      argv.behaviors = argv.behaviors.split(",");
    }

    argv.behaviors.forEach((x) => behaviorOpts[x] = true);

    if (argv.logging.includes("behaviors")) {
      behaviorOpts.log = BEHAVIOR_LOG_FUNC;
    } else if (argv.logging.includes("behaviors-debug")) {
      behaviorOpts.log = BEHAVIOR_LOG_FUNC;
      argv.behaviorsLogDebug = true;
    }

    argv.behaviorOpts = JSON.stringify(behaviorOpts);

    if (!argv.newContext) {
      argv.newContext = "page";
    }

    switch (argv.newContext) {
    case "page":
      argv.newContext = Cluster.CONCURRENCY_PAGE;
      if (argv.screencastPort && argv.workers > 1) {
        console.warn("Note: Screencast with >1 workers and default page context may only show one page at a time. To fix, add '--newContext window' to open each page in a new window");
      }
      break;

    case "session":
      argv.newContext = Cluster.CONCURRENCY_CONTEXT;
      break;

    case "browser":
      argv.newContext = Cluster.CONCURRENCY_BROWSER;
      break;

    case "window":
      argv.newContext = NewWindowPage;
      break;

    default:
      throw new Error("Invalid newContext, must be one of: page, window, session, browser");
    }

    if (argv.mobileDevice) {
      argv.emulateDevice = puppeteer.devices[argv.mobileDevice];
      if (!argv.emulateDevice) {
        throw new Error("Unknown device: " + argv.mobileDevice);
      }
    }

    // Normalize seeds to an array up front, so that a single string seed is
    // never iterated character by character below.
    if (typeof argv.seeds === "string") {
      argv.seeds = [argv.seeds];
    } else if (argv.seeds == null) {
      argv.seeds = [];
    }

    if (argv.seedFile) {
      const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");

      // Handle both LF and CRLF files; skip blank lines.
      for (const line of urlSeedFile.split(/\r?\n/)) {
        const seed = line.trim();
        if (seed) {
          argv.seeds.push(seed);
        }
      }
    }

    if ((argv.include || argv.exclude) && argv.scopeType && argv.scopeType !== "custom") {
      console.warn("You've specified a --scopeType and a --scopeIncludeRx or --scopeExcludeRx regex. The custom scope regex will take precedence, overriding the scopeType");
      argv.scopeType = "custom";
    }

    // Global scope settings; any per-seed setting overrides these.
    const scopeOpts = {
      scopeType: argv.scopeType,
      sitemap: argv.sitemap,
      include: argv.include,
      exclude: argv.exclude,
      depth: argv.depth,
    };

    argv.scopedSeeds = [];

    for (let seed of argv.seeds) {
      if (typeof seed === "string") {
        seed = { url: seed };
      }
      argv.scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...seed }));
    }

    // Resolve statsFilename
    if (argv.statsFilename) {
      argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
    }

    if (argv.profile) {
      // Pass the path as a discrete argument (no shell), so paths containing
      // spaces or shell metacharacters are handled safely.
      child_process.execFileSync("tar", ["xvfz", argv.profile], { cwd: this.profileDir });
    }

    return true;
  }
}
2021-07-07 18:56:52 -04:00
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
/**
 * Module entry point: build an ArgParser bound to `profileDir` and parse
 * `argv` with it.
 *
 * @param {string} profileDir - Passed to the ArgParser constructor.
 * @param {string[]} [argv] - Argument vector forwarded to
 *   `ArgParser#parseArgs`.
 * @returns {Object} Whatever `ArgParser#parseArgs` returns.
 */
module.exports.parseArgs = function (profileDir, argv) {
  const parser = new ArgParser(profileDir);
  return parser.parseArgs(argv);
};