import child_process from "child_process";
import path from "path";
import fs from "fs";
import os from "os";
import fsp from "fs/promises";

import fetch from "node-fetch";
import { RedisCrawlState, MemoryCrawlState } from "./util/state.js";
import AbortController from "abort-controller";
import Sitemapper from "sitemapper";
import { v4 as uuidv4 } from "uuid";
import yaml from "js-yaml";
import * as warcio from "warcio";

import { HealthChecker } from "./util/healthcheck.js";
import { TextExtract } from "./util/textextract.js";
import { initStorage, getFileSize, getDirSize, interpolateFilename } from "./util/storage.js";
import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { parseArgs } from "./util/argParser.js";
import { initRedis } from "./util/redis.js";
import { Logger, errJSON, setExternalLogStream, setDebugLogging } from "./util/logger.js";
import { WorkerPool } from "./util/worker.js";

import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } from "./util/browser.js";
import { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
import { AdBlockRules, BlockRules } from "./util/blockrules.js";

// to ignore HTTPS error for HEAD check
import { Agent as HTTPAgent } from "http";
import { Agent as HTTPSAgent } from "https";

const HTTPS_AGENT = new HTTPSAgent({
  rejectUnauthorized: false,
});

const HTTP_AGENT = new HTTPAgent();
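
// A minimal usage sketch (the URL is a placeholder, not part of this file's
// flow): node-fetch accepts an `agent` option that may be a function of the
// parsed URL, so a HEAD check can skip TLS verification on https:// URLs
// while using the plain agent for http:// ones:
//
//   const resp = await fetch("https://example.com/", {
//     method: "HEAD",
//     agent: (parsedUrl) => parsedUrl.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT,
//   });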
const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
const FETCH_TIMEOUT_SECS = 30;
const PAGE_OP_TIMEOUT_SECS = 5;
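
// A sketch of how a constant like FETCH_TIMEOUT_SECS is typically applied
// together with the AbortController imported above (url is a placeholder):
// abort the request if it hasn't completed within the window.
//
//   const controller = new AbortController();
//   const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_SECS * 1000);
//   try {
//     const resp = await fetch(url, { method: "HEAD", signal: controller.signal });
//   } finally {
//     clearTimeout(timer);
//   }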
// ============================================================================
export class Crawler {
  constructor() {
    const res = parseArgs();
    this.params = res.parsed;
    this.origConfig = res.origConfig;

    // root collections dir
    this.collDir = path.join(this.params.cwd, "collections", this.params.collection);
    this.logDir = path.join(this.collDir, "logs");
    this.logFilename = path.join(this.logDir, `crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`);

    const debugLogging = this.params.logging.includes("debug");
    setDebugLogging(debugLogging);
    this.logger = new Logger();

    this.logger.debug("Writing log to: " + this.logFilename, {}, "init");

    this.headers = {};
    this.crawlState = null;

    this.emulateDevice = null;

    // pages file
    this.pagesFH = null;

    this.crawlId = process.env.CRAWL_ID || os.hostname();
    this.startTime = Date.now();

    // was the limit hit?
    this.limitHit = false;

    this.userAgent = "";

    this.saveStateFiles = [];
    this.lastSaveTime = 0;
    this.saveStateInterval = this.params.saveStateInterval * 1000;

    this.emulateDevice = this.params.emulateDevice;

    this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
    this.capturePrefix = process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/";

    this.gotoOpts = {
      waitUntil: this.params.waitUntil,
      timeout: this.params.timeout
    };
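
    // Note (assumption about later usage): gotoOpts is the options object
    // handed to puppeteer's page.goto(), roughly
    // `await page.goto(url, this.gotoOpts)`, so waitUntil and timeout here
    // follow goto()'s option semantics.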

    // pages directory
    this.pagesDir = path.join(this.collDir, "pages");

    // pages file
    this.pagesFile = path.join(this.pagesDir, "pages.jsonl");

    this.blockRules = null;
    this.adBlockRules = null;

    this.healthChecker = null;

    this.interrupted = false;
    this.finalExit = false;
    this.clearOnExit = false;
    this.done = false;

    this.behaviorLastLine = null;
  }

  configureUA() {
    // override userAgent
    if (this.params.userAgent) {
      if (this.emulateDevice) {
        this.emulateDevice.userAgent = this.params.userAgent;
      }
      this.userAgent = this.params.userAgent;
      return;
    }

    // if device set, it overrides the default Chrome UA
    if (this.emulateDevice) {
      this.userAgent = this.emulateDevice.userAgent;
    } else {
      this.userAgent = getDefaultUA();
    }

    // suffix to append to default userAgent
    if (this.params.userAgentSuffix) {
      this.userAgent += " " + this.params.userAgentSuffix;
      if (this.emulateDevice) {
        this.emulateDevice.userAgent += " " + this.params.userAgentSuffix;
      }
    }
  }
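
  // Illustrative example (flag values hypothetical): an explicit --userAgent
  // wins and skips the suffix (early return above); otherwise the emulated
  // device's UA or getDefaultUA() is used and --userAgentSuffix is appended.
  // E.g. no overrides plus --userAgentSuffix "MyOrgBot/1.0" yields roughly
  // "Mozilla/5.0 (...) Chrome/... MyOrgBot/1.0".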

  async initCrawlState() {
    const redisUrl = this.params.redisStoreUrl;

    if (redisUrl) {
      if (!redisUrl.startsWith("redis://")) {
        this.logger.fatal("stateStoreUrl must start with redis:// -- Only redis-based store currently supported");
      }

      let redis;

      while (true) {
        try {
          redis = await initRedis(redisUrl);
          break;
        } catch (e) {
          //this.logger.fatal("Unable to connect to state store Redis: " + redisUrl);
          this.logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
          await this.sleep(3);
        }
      }

      this.logger.debug(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");

      this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname());
    } else {
      this.logger.debug("Storing state in memory", {}, "state");

      this.crawlState = new MemoryCrawlState();
    }

    if (this.params.saveState === "always" && this.params.saveStateInterval) {
      this.logger.debug(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");
    }

    return this.crawlState;
  }
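
  // Usage note: with --redisStoreUrl (e.g. redis://localhost:6379/0, a
  // hypothetical value) the frontier/pending/done state lives in Redis under
  // the crawlId key prefix, so it survives restarts and can be shared across
  // containers; otherwise MemoryCrawlState keeps everything in-process and
  // state is lost when the crawler exits.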

  initScreenCaster() {
    let transport;

    if (this.params.screencastPort) {
      transport = new WSTransport(this.params.screencastPort);
      this.logger.debug(`Screencast server started on: ${this.params.screencastPort}`, {}, "screencast");
    } else if (this.params.redisStoreUrl && this.params.screencastRedis) {
      transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId);
      this.logger.debug("Screencast enabled via redis pubsub", {}, "screencast");
    }

    if (!transport) {
      return null;
    }

    return new ScreenCaster(transport, this.params.workers);
  }
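
  // Design note: WSTransport serves the screencast over a local WebSocket on
  // --screencastPort, while RedisPubSubTransport relays frames through the
  // shared Redis instance when --screencastRedis is set, which fits
  // multi-container deployments where a direct socket to the crawler may not
  // be reachable.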

  async bootstrap() {
    const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd});

    if (initRes.status) {
      this.logger.info("wb-manager init failed, collection likely already exists");
    }

    fs.mkdirSync(this.logDir, {recursive: true});
    this.logFH = fs.createWriteStream(this.logFilename);
    setExternalLogStream(this.logFH);

    this.infoString = await this.getInfoString();
    this.logger.info(this.infoString);

    this.logger.info("Seeds", this.params.scopedSeeds);

    if (this.params.profile) {
      this.logger.info("With Browser Profile", {url: this.params.profile});
    }

    if (this.params.overwrite) {
      this.logger.debug(`Clearing ${this.collDir} before starting`);
      try {
        fs.rmSync(this.collDir, { recursive: true, force: true });
      } catch(e) {
        this.logger.error(`Unable to clear ${this.collDir}`, e);
      }
    }

    let opts = {};
    let redisStdio;

    if (this.params.logging.includes("pywb")) {
      const pywbStderr = fs.openSync(path.join(this.logDir, "pywb.log"), "a");
      const stdio = [process.stdin, pywbStderr, pywbStderr];

      const redisStderr = fs.openSync(path.join(this.logDir, "redis.log"), "a");
      redisStdio = [process.stdin, redisStderr, redisStderr];

      opts = {stdio, cwd: this.params.cwd};
    } else {
      opts = {stdio: "ignore", cwd: this.params.cwd};
      redisStdio = "ignore";
    }

    this.browserExe = getBrowserExe();

    this.configureUA();

    this.headers = {"User-Agent": this.userAgent};

    const subprocesses = [];

    subprocesses.push(child_process.spawn("redis-server", {cwd: "/tmp/", stdio: redisStdio}));

    opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};

    subprocesses.push(child_process.spawn("uwsgi", [new URL("uwsgi.ini", import.meta.url).pathname], opts));

    process.on("exit", () => {
      for (const proc of subprocesses) {
        proc.kill();
      }
    });

    child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
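
    // The socat forward above exposes the browser's devtools endpoint
    // (listening internally on port 9221) on the conventional port 9222, so
    // external tools can attach to it.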

    if (!this.params.headless && !process.env.NO_XVFB) {
      child_process.spawn("Xvfb", [
        process.env.DISPLAY,
        "-listen",
        "tcp",
        "-screen",
        "0",
        process.env.GEOMETRY,
        "-ac",
        "+extension",
        "RANDR"
      ]);
    }
  }

  get puppeteerArgs() {
    // Puppeteer Options
    return {
      headless: this.params.headless,
      executablePath: this.browserExe,
      handleSIGINT: false,
      handleSIGTERM: false,
      handleSIGHUP: false,
      ignoreHTTPSErrors: true,
      args: chromeArgs(!process.env.NO_PROXY, this.userAgent, this.extraChromeArgs()),
      userDataDir: this.profileDir,
      defaultViewport: null,
      waitForInitialPage: false
    };
  }
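
  // Usage sketch: this getter is shaped to be passed straight to puppeteer's
  // launch(), e.g. `const browser = await puppeteer.launch(this.puppeteerArgs);`.
  // SIGINT/SIGTERM/SIGHUP handling is disabled here so the crawler's own
  // signal handlers can drain pages and save state, rather than having
  // puppeteer close the browser immediately on the first signal.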

  extraChromeArgs() {
    const args = [];
    if (this.params.lang) {
      args.push(`--accept-lang=${this.params.lang}`);
    }
    return args;
  }
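
  // Example (hypothetical flag value): --lang en-US makes this return
  // ["--accept-lang=en-US"], which chromeArgs() merges into the browser
  // launch arguments.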

  async run() {
    await this.bootstrap();

    let status;
    let exitCode = 0;

    try {
      await this.crawl();
      status = (!this.interrupted ? "done" : "interrupted");
    } catch(e) {
      this.logger.error("Crawl failed", e);
      exitCode = 9;
      status = "failing";
      if (await this.crawlState.incFailCount()) {
        status = "failed";
      }
    } finally {
      this.logger.info(`Crawl status: ${status}`);
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
2022-06-30 19:24:26 -07:00
if ( this . crawlState ) {
await this . crawlState . setStatus ( status ) ;
}
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
2022-09-20 17:09:52 -07:00
process . exit ( exitCode ) ;
2020-10-31 13:16:37 -07:00
}
2020-11-01 19:22:53 -08:00
}
2021-04-10 13:08:22 -07:00
2023-02-23 18:50:22 -08:00
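
  // receive log messages from in-page behaviors: string data is logged as the
  // message itself, object data becomes structured log details, and
  // consecutive duplicate info lines are deduped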
  _behaviorLog({data, type}, pageUrl) {
    let behaviorLine;
    let message;
    let details;

    if (typeof(data) === "string") {
      message = data;
      details = {};
    } else {
      message = type === "info" ? "Behavior log" : "Behavior debug";
      details = typeof(data) === "object" ? data : {};
    }

    if (pageUrl) {
      details.page = pageUrl;
    }

    switch (type) {
    case "info":
      behaviorLine = JSON.stringify(data);
      if (behaviorLine !== this._behaviorLastLine) {
        this.logger.info(message, details, "behaviorScript");
        this._behaviorLastLine = behaviorLine;
      }
      break;

    case "debug":
    default:
      this.logger.debug(message, details, "behaviorScript");
    }
  }
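
  // delegate scope checks to the seed this URL was queued under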
  isInScope({seedId, url, depth, extraHops} = {}, logDetails = {}) {
    const seed = this.params.scopedSeeds[seedId];

    return seed.isIncluded(url, depth, extraHops, logDetails);
  }
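
  // process a single page: screencast, device emulation, behavior injection,
  // custom driver, optional screenshots and text extraction, then behaviors,
  // limit checks and state serialization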
  async crawlPage(opts) {
    await this.writeStats();

    const {page, data} = opts;
    const {url} = data;

    const logDetails = {page: url, workerid: page._workerid};

    if (!this.isInScope(data, logDetails)) {
      this.logger.info("Page no longer in scope", data);
      return;
    }

    try {
      if (this.screencaster) {
        await this.screencaster.screencastTarget(page.target(), url);
      }

      if (this.emulateDevice) {
        await page.emulate(this.emulateDevice);
      }

      if (this.params.profile) {
        await page._client().send("Network.setBypassServiceWorker", {bypass: true});
      }

      await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");

      if (this.params.behaviorOpts && !page.__bx_inited) {
        await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, url));
        await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
        page.__bx_inited = true;
      }

      // run custom driver here
      await this.driver({page, data, crawler: this});

      const title = await page.title();

      if (this.params.screenshot) {
        if (!page.isHTMLPage) {
          this.logger.debug("Skipping screenshots for non-HTML page", logDetails);
        } else {
          const archiveDir = path.join(this.collDir, "archive");
          const screenshots = new Screenshots({page, url, directory: archiveDir});

          if (this.params.screenshot.includes("view")) {
            await screenshots.take();
          }
          if (this.params.screenshot.includes("fullPage")) {
            await screenshots.takeFullPage();
          }
          if (this.params.screenshot.includes("thumbnail")) {
            await screenshots.takeThumbnail();
          }
        }
      }

      let text = "";
      if (this.params.text && page.isHTMLPage) {
        this.logger.debug("Extracting text", logDetails, "general");
        const client = await page.target().createCDPSession();
        const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
        text = await new TextExtract(result).parseTextFromDom();
      }

      await this.writePage(data, title, this.params.text ? text : null);

      if (this.params.behaviorOpts) {
        if (!page.isHTMLPage) {
          this.logger.debug("Skipping behaviors for non-HTML page", logDetails, "behavior");
        } else {
          const behaviorTimeout = this.params.behaviorTimeout / 1000;

          const res = await this.timedRun(
            this.runBehaviors(page, logDetails),
            behaviorTimeout,
            "Behaviors timed out",
            logDetails,
            "behavior"
          );

          if (res && res.length) {
            this.logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior");
          }
        }
      }

      this.logger.info("Page finished", logDetails, "pageStatus");

      await this.checkLimits();

      await this.serializeConfig();
    } catch (e) {
      this.logger.error("Page Errored", {...errJSON(e), ...logDetails}, "pageStatus");
      await this.markPageFailed(page);
    }
  }
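
  // run behaviors in all filtered frames, settling all of them even if some fail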
  async runBehaviors(page, logDetails) {
    try {
      const frames = page.__filteredFrames;

      this.logger.info("Running behaviors", {frames: frames.length, frameUrls: frames.map(frame => frame.url()), ...logDetails}, "behavior");

      return await Promise.allSettled(
        frames.map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();", logDetails, "behavior"))
      );
    } catch (e) {
      this.logger.warn("Behavior run failed", {...errJSON(e), ...logDetails}, "behavior");
      return null;
    }
  }
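
  // always include the top frame; skip about:blank subframes and frames
  // whose URL matches the ad-block rules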
  shouldIncludeFrame(frame, logDetails) {
    if (!frame.parentFrame()) {
      return true;
    }

    const frameUrl = frame.url();

    let res;

    if (frameUrl === "about:blank") {
      res = false;
    } else {
      res = !this.adBlockRules.isAdUrl(frameUrl);
    }

    if (!res) {
      this.logger.debug("Skipping processing frame", {frameUrl, ...logDetails}, "behavior");
    }

    return res;
  }
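
  // version banner combining crawler, warcio.js and pywb versions,
  // presumably cached as this.infoString and written as the "software"
  // field of the warcinfo record below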
  async getInfoString() {
    const packageFileJSON = JSON.parse(await fsp.readFile("../app/package.json"));
    const warcioPackageJSON = JSON.parse(await fsp.readFile("/app/node_modules/warcio/package.json"));
    const pywbVersion = child_process.execSync("pywb -V", {encoding: "utf8"}).trim().split(" ")[1];

    return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version} pywb ${pywbVersion})`;
  }

  async createWARCInfo(filename) {
    const warcVersion = "WARC/1.0";
    const type = "warcinfo";

    const info = {
      "software": this.infoString,
      "format": "WARC File Format 1.0"
    };

    const warcInfo = {...info, ...this.params.warcInfo};
    const record = await warcio.WARCRecord.createWARCInfo({filename, type, warcVersion}, warcInfo);
    const buffer = await warcio.WARCSerializer.serialize(record, {gzip: true});
    return buffer;
  }
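
  // enforce crawl limits: --sizeLimit (in bytes, against the archive dir)
  // and --timeLimit (in seconds); triggers a graceful finish when either is hit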
  async checkLimits() {
    let interrupt = false;

    if (this.params.sizeLimit) {
      const dir = path.join(this.collDir, "archive");
      const size = await getDirSize(dir);

      if (size >= this.params.sizeLimit) {
        this.logger.info(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
        interrupt = true;
        this.clearOnExit = true;
      }
    }

    if (this.params.timeLimit) {
      const elapsed = (Date.now() - this.startTime) / 1000;
      if (elapsed > this.params.timeLimit) {
        this.logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
        interrupt = true;
      }
    }

    if (interrupt) {
      this.gracefulFinish();
    }
  }
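
  // stop handing out new URLs, mark the crawl interrupted and interrupt the
  // worker pool; also marks this as the final exit unless --waitOnDone is set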
  gracefulFinish() {
    this.crawlState.setDrain(true);
    this.interrupted = true;
    this.workerPool.interrupt();
    if (!this.params.waitOnDone) {
      this.finalExit = true;
    }
  }

  prepareForExit(markDone = true) {
    if (!markDone) {
      this.params.waitOnDone = false;
      this.clearOnExit = true;
      this.logger.info("SIGNAL: Preparing for exit of this crawler instance only");
    } else {
      this.logger.info("SIGNAL: Preparing for final exit of all crawlers");
      this.finalExit = true;
    }
  }

  async serializeAndExit() {
    await this.serializeConfig();
    process.exit(0);
  }
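
  // main crawl setup: load profile, start health checker, import the driver,
  // init crawl state (waiting while paused for debugging), then queue seeds
  // and run the worker pool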
  async crawl() {
    this.profileDir = await loadProfile(this.params.profile);

    if (this.params.healthCheckPort) {
      this.healthChecker = new HealthChecker(this.params.healthCheckPort, this.params.workers);
    }

    try {
      const driverUrl = new URL(this.params.driver, import.meta.url);
      this.driver = (await import(driverUrl)).default;
    } catch (e) {
      this.logger.warn(`Error importing driver ${this.params.driver}`, e);
      return;
    }

    await this.initCrawlState();

    let initState = await this.crawlState.getStatus();

    while (initState === "debug") {
      this.logger.info("Paused for debugging, will continue after manual resume");
      await this.sleep(60);
      initState = await this.crawlState.getStatus();
    }

    // if already done, don't crawl anymore
    if (initState === "done") {
      this.done = true;

      if (this.params.waitOnDone) {
        this.logger.info("Already done, waiting for signal to exit...");

        // wait forever until signal
        await new Promise(() => {});
      }

      return;
    }

    if (this.params.generateWACZ) {
      this.storage = initStorage();
    }

    if (initState === "finalize") {
      await this.postCrawl();
      return;
    }

    await this.crawlState.setStatus("running");
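
    // restore a previously saved crawl state (frontier queue, pending, done),
    // re-applying the current scoping rules to the restored URLs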
    if (this.params.state) {
      await this.crawlState.load(this.params.state, this.params.scopedSeeds, true);
    }

    await this.initPages();

    this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, this.logger);

    if (this.params.blockRules && this.params.blockRules.length) {
      this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, this.logger);
    }

    this.screencaster = this.initScreenCaster();
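
    // queue each seed URL at depth 0; if a seed has an attached sitemap,
    // also queue the sitemap's URLs for that seed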
    for (let i = 0; i < this.params.scopedSeeds.length; i++) {
      const seed = this.params.scopedSeeds[i];

      if (!await this.queueUrl(i, seed.url, 0, 0)) {
        if (this.limitHit) {
          break;
        }
      }

      if (seed.sitemap) {
        await this.parseSitemap(seed.sitemap, i);
      }
    }
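
    // run the crawl with a pool of browser workers; each worker invokes
    // crawlPage() for URLs handed out from the shared crawl state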
    this.workerPool = new WorkerPool({
      maxConcurrency: this.params.workers,
      puppeteerOptions: this.puppeteerArgs,
      crawlState: this.crawlState,
      screencaster: this.screencaster,
      healthChecker: this.healthChecker,
      task: (opts) => this.crawlPage(opts)
    });

    await this.workerPool.work();
    await this.workerPool.close();

    await this.serializeConfig(true);

    if (this.pagesFH) {
      await this.pagesFH.sync();
      await this.pagesFH.close();
    }

    await this.writeStats(true);

    // extra wait for all resources to land into WARCs
    await this.awaitPendingClear();

    await this.postCrawl();
  }
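
  // post-crawl steps: combine WARCs, regenerate CDX, close the log, generate
  // the WACZ (clearing the collection dir if requested), and optionally wait
  // for an external signal when done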
  async postCrawl() {
    if (this.params.combineWARC) {
      await this.combineWARC();
    }

    if (this.params.generateCDX) {
      this.logger.info("Generating CDX");
      await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
    }

    await this.closeLog();

    if (this.params.generateWACZ && (!this.interrupted || this.finalExit || this.clearOnExit)) {
      await this.generateWACZ();

      if (this.clearOnExit) {
        this.logger.info(`Clearing ${this.collDir} before exit`);
        try {
          fs.rmSync(this.collDir, { recursive: true, force: true });
        } catch (e) {
          this.logger.warn(`Unable to clear ${this.collDir} before exit`, e);
        }
      }
    }

    if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) {
      this.done = true;
      this.logger.info("All done, waiting for signal...");
      await this.crawlState.setStatus("done");

      // wait forever until signal
      await new Promise(() => {});
    }
  }
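
  // detach and close the file-based log stream, ignoring close errors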
  async closeLog() {
    // close file-based log
    setExternalLogStream(null);
    try {
      await new Promise(resolve => this.logFH.close(() => resolve()));
    } catch (e) {
      // ignore
    }
  }
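
  // package the collection as a WACZ; the spawned command is roughly:
  //   wacz create --split-seeds -o <collection>.wacz --pages <pagesFile>
  //        --log-directory <logDir> [--signing-url URL [--signing-token TOKEN]]
  //        -f <warc files...>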
2022-02-08 15:31:55 -08:00
async generateWACZ ( ) {
2022-12-15 12:38:41 -05:00
this . logger . info ( "Generating WACZ" ) ;
2021-06-23 19:36:32 -07:00
2022-02-08 15:31:55 -08:00
const archiveDir = path . join ( this . collDir , "archive" ) ;
2021-11-23 12:53:30 -08:00
2022-02-08 15:31:55 -08:00
// Get a list of the warcs inside
const warcFileList = await fsp . readdir ( archiveDir ) ;
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
// is finished (>0 pages and all pages written)
const isFinished = await this . crawlState . isFinished ( ) ;
2022-12-15 12:38:41 -05:00
this . logger . info ( ` Num WARC Files: ${ warcFileList . length } ` ) ;
2022-02-08 15:31:55 -08:00
if ( ! warcFileList . length ) {
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
// if finished, just return
if ( isFinished ) {
return ;
}
2022-12-15 12:38:41 -05:00
this . logger . fatal ( "No WARC Files, assuming crawl failed" ) ;
2022-02-08 15:31:55 -08:00
}
// Build the argument list to pass to the wacz create command
const waczFilename = this . params . collection . concat ( ".wacz" ) ;
const waczPath = path . join ( this . collDir , waczFilename ) ;
2023-02-24 18:31:08 -08:00
const createArgs = [
"create" ,
"--split-seeds" ,
"-o" , waczPath ,
"--pages" , this . pagesFile ,
"--log-directory" , this . logDir
] ;
2022-02-08 15:31:55 -08:00
if ( process . env . WACZ _SIGN _URL ) {
createArgs . push ( "--signing-url" ) ;
createArgs . push ( process . env . WACZ _SIGN _URL ) ;
if ( process . env . WACZ _SIGN _TOKEN ) {
createArgs . push ( "--signing-token" ) ;
createArgs . push ( process . env . WACZ _SIGN _TOKEN ) ;
2021-11-23 12:53:30 -08:00
}
2021-02-04 00:28:32 -05:00
}
2022-02-08 15:31:55 -08:00
createArgs . push ( "-f" ) ;
warcFileList . forEach ( ( val , index ) => createArgs . push ( path . join ( archiveDir , val ) ) ) ; // eslint-disable-line no-unused-vars
// create WACZ
2022-12-15 12:38:41 -05:00
const waczResult = await this . awaitProcess ( child _process . spawn ( "wacz" , createArgs ) ) ;
2022-02-08 15:31:55 -08:00
    if (waczResult !== 0) {
      this.logger.error("Error creating WACZ", {"status code": waczResult});
      this.logger.fatal("Unable to write WACZ successfully");
    }

    this.logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);
    // Verify WACZ
    /*
    const validateArgs = ["validate"];
    validateArgs.push("-f");
    validateArgs.push(waczPath);

    const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs));
    if (waczVerifyResult !== 0) {
      console.log("validate", waczVerifyResult);
      this.logger.fatal("Unable to verify WACZ created successfully");
    }
    */
    if (this.storage) {
      const filename = process.env.STORE_FILENAME || "@ts-@id.wacz";
      const targetFilename = interpolateFilename(filename, this.crawlId);
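      // Illustrative: with the default template above, interpolateFilename()
      // substitutes @ts with a timestamp and @id with this.crawlId, yielding
      // something like "20230310123456-my-crawl-id.wacz" (example values only).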
      await this.storage.uploadCollWACZ(waczPath, targetFilename, isFinished);
    }
  }
  awaitProcess(proc) {
    proc.stdout.on("data", (data) => {
      this.logger.debug(data.toString());
    });

    proc.stderr.on("data", (data) => {
      this.logger.error(data.toString());
    });
    return new Promise((resolve) => {
      proc.on("close", (code) => resolve(code));
    });
  }
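  // Example usage (sketch): spawn a subprocess and wait for its exit code.
  // The "--version" flag here is only illustrative:
  //   const code = await this.awaitProcess(child_process.spawn("wacz", ["--version"]));
  //   if (code !== 0) { this.logger.error("wacz exited with error"); }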
  async writeStats(toFile = false) {
    if (!this.params.logging.includes("stats")) {
      return;
    }
    const realSize = await this.crawlState.realSize();
    const pendingList = await this.crawlState.getPendingList();
    const done = await this.crawlState.numDone();
    const total = realSize + pendingList.length + done;

    const limit = {max: this.params.limit || 0, hit: this.limitHit};

    const stats = {
      "crawled": done,
      "total": total,
      "pending": pendingList.length,
      "limit": limit,
      "pendingPages": pendingList.map(x => JSON.stringify(x))
    };

    this.logger.info("Crawl statistics", stats, "crawlStatus");
    if (toFile && this.params.statsFilename) {
      try {
        await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2));
      } catch (err) {
        this.logger.warn("Stats output failed", err);
      }
    }
  }
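  // For reference, a stats file written above might look like this (values
  // are illustrative):
  //   {
  //     "crawled": 10,
  //     "total": 25,
  //     "pending": 2,
  //     "limit": {"max": 0, "hit": false},
  //     "pendingPages": ["{\"url\":\"https://example.com/\"}"]
  //   }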
  async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
    const {url, seedId, depth, extraHops = 0} = urlData;
    const logDetails = {page: url, workerid: page._workerid};

    let isHTMLPage = true;
    const isHTMLResult = await this.timedRun(
      this.isHTML(url),
      FETCH_TIMEOUT_SECS,
      "HEAD request to determine if URL is HTML page timed out",
      logDetails
    );

    // timedRun() resolves with the result of isHTML(), or undefined on timeout
    if (isHTMLResult === false) {
      isHTMLPage = false;
      try {
        const captureResult = await this.timedRun(
          this.directFetchCapture(url),
          FETCH_TIMEOUT_SECS,
          "Direct fetch capture attempt timed out",
          logDetails
        );

        // direct fetch sufficed, no browser-based capture needed for this URL
        if (captureResult) {
          return;
        }
      } catch (e) {
        // ignore failed direct fetch attempt, do browser-based capture
      }
    }

    if (this.adBlockRules && this.params.blockAds) {
      await this.adBlockRules.initPage(page);
    }

    if (this.blockRules) {
      await this.blockRules.initPage(page);
    }
    let ignoreAbort = false;

    // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
    // if so, don't report as an error
    page.once("requestfailed", (req) => {
      ignoreAbort = shouldIgnoreAbort(req);
    });
    // more serious page error, mark page session as invalid
    page.on("error", () => this.markPageFailed(page));

    page.on("console", (msg) => {
      if (this.params.logging.includes("jserrors") && (msg.type() === "error")) {
        this.logger.warn(msg.text(), {"location": msg.location()}, "jsError");
      }
    });
2022-03-22 17:41:51 -07:00
const gotoOpts = isHTMLPage ? this . gotoOpts : "domcontentloaded" ;
2023-03-08 21:31:19 -05:00
this . logger . info ( "Awaiting page load" , logDetails ) ;
    try {
      const resp = await page.goto(url, gotoOpts);

      isHTMLPage = this.isHTMLContentType(resp.headers()["content-type"]);

      if (this.healthChecker) {
        this.healthChecker.resetErrors();
      }
    } catch (e) {
      let msg = e.message || "";
      if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
        const mainMessage = e.name === "TimeoutError" ? "Page Load Timeout" : "Page Load Error";
        this.logger.error(mainMessage, {msg, ...logDetails});
        if (this.healthChecker) {
          this.healthChecker.incError();
        }
      }
    }
    page.isHTMLPage = isHTMLPage;

    if (isHTMLPage) {
      page.__filteredFrames = page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails));
    } else {
      page.__filteredFrames = null;
    }

    if (!isHTMLPage) {
      this.logger.debug("Skipping link extraction for non-HTML page", logDetails);
      return;
    }
    const seed = this.params.scopedSeeds[seedId];

    await this.checkCF(page, logDetails);

    await this.netIdle(page, logDetails);

    // skip extraction if at max depth
    if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
      return;
    }
    this.logger.debug("Extracting links");

    for (const opts of selectorOptsList) {
      const links = await this.extractLinks(page, opts, logDetails);
      await this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails);
    }
  }
  async markPageFailed(page) {
    page.__failed = true;
    if (this.healthChecker) {
      this.healthChecker.incError();
    }
    if (this.screencaster) {
      await this.screencaster.endTarget(page.target());
    }
  }
  async netIdle(page, details) {
    if (!this.params.netIdleWait) {
      return;
    }

    // in case page starts loading via fetch/xhr immediately after page load,
    // we want to ensure we don't exit too early
    await this.sleep(0.5);

    try {
      await page.waitForNetworkIdle({timeout: this.params.netIdleWait * 1000});
    } catch (e) {
      this.logger.debug("waitForNetworkIdle timed out, ignoring", details);
      // ignore, continue
    }
  }
  async extractLinks(page, {selector = "a[href]", extract = "href", isAttribute = false} = {}, logDetails) {
    const results = [];

    const loadProp = (selector, extract) => {
      return [...document.querySelectorAll(selector)].map(elem => elem[extract]);
    };

    const loadAttr = (selector, extract) => {
      return [...document.querySelectorAll(selector)].map(elem => elem.getAttribute(extract));
    };

    const loadFunc = isAttribute ? loadAttr : loadProp;
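    // Illustrative selector options (same shape as DEFAULT_SELECTORS entries):
    //   {selector: "a[href]", extract: "href", isAttribute: false}  -> reads the elem.href property
    //   {selector: "img[src]", extract: "src", isAttribute: true}   -> reads elem.getAttribute("src")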
    try {
      const frames = page.__filteredFrames;

      const linkResults = await Promise.allSettled(
        frames.map(frame => this.timedRun(
          frame.evaluate(loadFunc, selector, extract),
          PAGE_OP_TIMEOUT_SECS,
          "Link extraction timed out",
          logDetails,
        ))
      );
      if (linkResults) {
        // use an indexed loop so frames[i] stays aligned with its result,
        // even when an iteration is skipped via continue
        for (let i = 0; i < linkResults.length; i++) {
          const linkResult = linkResults[i];

          if (!linkResult) {
            this.logger.warn("Link Extraction timed out in frame", {frameUrl: frames[i].url(), ...logDetails});
            continue;
          }

          if (!linkResult.value) continue;

          for (const link of linkResult.value) {
            results.push(link);
          }
        }
      }
    } catch (e) {
      this.logger.warn("Link Extraction failed", e);
    }

    return results;
  }
  async queueInScopeUrls(seedId, urls, depth, extraHops = 0, logDetails = {}) {
    try {
      depth += 1;

      // new number of extra hops, set if this hop is out-of-scope (oos)
      const newExtraHops = extraHops + 1;
      for (const possibleUrl of urls) {
        const res = this.isInScope({url: possibleUrl, extraHops: newExtraHops, depth, seedId}, logDetails);

        if (!res) {
          continue;
        }

        const {url, isOOS} = res;

        if (url) {
          await this.queueUrl(seedId, url, depth, isOOS ? newExtraHops : extraHops);
        }
      }
    } catch (e) {
      this.logger.error("Queuing Error", e);
    }
  }
  async checkCF(page, logDetails) {
    try {
      this.logger.debug("Check CF Blocking", logDetails);

      while (await this.timedRun(
        page.$("div.cf-browser-verification.cf-im-under-attack"),
        PAGE_OP_TIMEOUT_SECS
      )) {
        this.logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails);
        await this.sleep(5.5);
      }
    } catch (e) {
      //this.logger.warn("Check CF failed, ignoring");
    }
  }
  async queueUrl(seedId, url, depth, extraHops = 0) {
    this.logger.debug(`Queuing url ${url}`);
    if (this.limitHit) {
      return false;
    }

    if (this.params.limit > 0 && (await this.crawlState.numRealSeen() >= this.params.limit)) {
      this.limitHit = true;
      return false;
    }
    if (await this.crawlState.has(url)) {
      return false;
    }

    await this.crawlState.add(url);

    const urlData = {url, seedId, depth};

    if (extraHops) {
      urlData.extraHops = extraHops;
    }

    await this.crawlState.push(urlData);
    return true;
  }
  async initPages() {
    try {
      let createNew = false;

      // create pages dir if doesn't exist and write pages.jsonl header
      if (!fs.existsSync(this.pagesDir)) {
        await fsp.mkdir(this.pagesDir);
        createNew = true;
      }
      this.pagesFH = await fsp.open(this.pagesFile, "a");

      if (createNew) {
        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};

        if (this.params.text) {
          header["hasText"] = true;
          this.logger.debug("Text Extraction: Enabled");
        } else {
          header["hasText"] = false;
          this.logger.debug("Text Extraction: Disabled");
        }

        const header_formatted = JSON.stringify(header).concat("\n");
        await this.pagesFH.writeFile(header_formatted);
      }

    } catch (err) {
      this.logger.error("pages/pages.jsonl creation failed", err);
    }
  }
  async writePage({url, depth}, title, text) {
    const id = uuidv4();
    const row = {"id": id, "url": url, "title": title};
    if (depth === 0) {
      row.seed = true;
    }

    if (text !== null) {
      row.text = text;
    }

    const processedRow = JSON.stringify(row) + "\n";

    try {
      await this.pagesFH.writeFile(processedRow);
    } catch (err) {
      this.logger.warn("pages/pages.jsonl append failed", err);
    }
  }
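  // For reference, pages.jsonl is one JSON object per line: the header written
  // in initPages() followed by one row per page (values illustrative):
  //   {"format":"json-pages-1.0","id":"pages","title":"All Pages","hasText":false}
  //   {"id":"<uuid>","url":"https://example.com/","title":"Example Domain","seed":true}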
  resolveAgent(urlParsed) {
    return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
  }

  async isHTML(url) {
    try {
      const resp = await fetch(url, {
        method: "HEAD",
        headers: this.headers,
        agent: this.resolveAgent
      });
      if (resp.status !== 200) {
        this.logger.debug(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
        return true;
      }

      return this.isHTMLContentType(resp.headers.get("Content-Type"));

    } catch (e) {
      // can't confirm not html, so try in browser
      return true;
    }
  }
  isHTMLContentType(contentType) {
    // just load if no content-type
    if (!contentType) {
      return true;
    }

    const mime = contentType.split(";")[0];

    if (HTML_TYPES.includes(mime)) {
      return true;
    }

    return false;
  }
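  // Example (illustrative): isHTMLContentType("text/html; charset=utf-8")
  // strips the parameter after ";" and matches "text/html" against HTML_TYPES,
  // returning true; a type like "application/pdf" would return false, assuming
  // it is not listed in HTML_TYPES.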
  async directFetchCapture(url) {
    //console.log(`Direct capture: ${this.capturePrefix}${url}`);
    const abort = new AbortController();
    const signal = abort.signal;

    const resp = await fetch(this.capturePrefix + url, {signal, headers: this.headers, redirect: "manual"});

    abort.abort();

    return resp.status === 200 && !resp.headers.get("set-cookie");
  }
  async awaitPendingClear() {
    this.logger.info("Waiting to ensure pending data is written to WARCs...");

    const redis = await initRedis("redis://localhost/0");

    while (!this.interrupted) {
      try {
        const count = Number(await redis.get(`pywb:${this.params.collection}:pending`) || 0);
        if (count <= 0) {
          break;
        }

        this.logger.debug(`Still waiting for ${count} pending requests to finish...`);
      } catch (e) {
        break;
      }
      await this.sleep(1);
    }
  }
  timedRun(promise, seconds, message = "Promise timed out", logDetails = {}, context = "general") {
    // return Promise return value or log error if timeout is reached first
    const timeout = seconds * 1000;

    const rejectPromiseOnTimeout = (timeout) => {
      return new Promise((resolve, reject) => {
        setTimeout(() => (reject("timeout reached")), timeout);
      });
    };

    return Promise.race([promise, rejectPromiseOnTimeout(timeout)])
      .catch(() => this.logger.error(message, {"seconds": seconds, ...logDetails}, context));
  }
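  // Example usage (sketch): resolves with fn()'s value, or with undefined after
  // logging an error if 30 seconds pass first. Note the underlying promise is
  // not cancelled on timeout, it just stops being awaited:
  //   const result = await this.timedRun(fn(), 30, "fn() timed out", logDetails);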
  sleep(seconds) {
    return new Promise(resolve => setTimeout(resolve, seconds * 1000));
  }
  async parseSitemap(url, seedId) {
    const sitemapper = new Sitemapper({
      url,
      timeout: 15000,
      requestHeaders: this.headers
    });

    try {
      const {sites} = await sitemapper.fetch();
      await this.queueInScopeUrls(seedId, sites, 0);
    } catch (e) {
      this.logger.warn("Error fetching sites from sitemap", e);
    }
  }
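  // Example usage (sketch; sitemap URL is illustrative): queue every URL listed
  // in a seed's sitemap at depth 0, subject to the usual scope checks:
  //   await this.parseSitemap("https://example.com/sitemap.xml", seedId);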
  async combineWARC() {
    this.logger.info("Generating Combined WARCs");

    // Get the list of created Warcs
    const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));

    this.logger.debug(`Combining ${warcLists.length} WARCs...`);

    const fileSizeObjects = []; // Used to sort the created warcs by fileSize

    // Go through the list of created warcs and build an array sorted by their filesize with the largest file first.
    for (let i = 0; i < warcLists.length; i++) {
      const fileName = path.join(this.collDir, "archive", warcLists[i]);
      const fileSize = await getFileSize(fileName);

      fileSizeObjects.push({"fileSize": fileSize, "fileName": fileName});
    }

    // sort once after all sizes are collected, largest first
    fileSizeObjects.sort((a, b) => b.fileSize - a.fileSize);
    const generatedCombinedWarcs = [];

    // Used to name combined warcs, default to -1 for first increment
    let combinedWarcNumber = -1;

    // write combine WARC to collection root
    let combinedWarcFullPath = "";

    // fileHandler
    let fh = null;
    // Iterate through the sorted file size array.
    for (let j = 0; j < fileSizeObjects.length; j++) {

      // if need to rollover to new warc
      let doRollover = false;

      // set to true for first warc
      if (combinedWarcNumber < 0) {
        doRollover = true;
      } else {
        // Check the size of the existing combined warc;
        // roll over to a new combined warc if appending the current warc would exceed the rollover size.
        const currentCombinedWarcSize = await getFileSize(combinedWarcFullPath);
        const proposedWarcSize = fileSizeObjects[j].fileSize + currentCombinedWarcSize;

        doRollover = (proposedWarcSize >= this.params.rolloverSize);
      }
      if (doRollover) {
        // If adding the current warc to the existing combined file creates a file larger than the rollover size do the following:
        // 1. increment the combinedWarcNumber
        // 2. create the name of the new combinedWarcFile
        // 3. Write the header out to the new file
        // 4. Write out the current warc data to the combinedFile
        combinedWarcNumber = combinedWarcNumber + 1;

        const combinedWarcName = `${this.params.collection}_${combinedWarcNumber}.warc.gz`;

        // write combined warcs to root collection dir as they're output of a collection (like wacz)
        combinedWarcFullPath = path.join(this.collDir, combinedWarcName);

        if (fh) {
          fh.end();
        }

        fh = fs.createWriteStream(combinedWarcFullPath, {flags: "a"});

        generatedCombinedWarcs.push(combinedWarcName);

        const warcBuffer = await this.createWARCInfo(combinedWarcName);
        fh.write(warcBuffer);
      }
2022-12-15 12:38:41 -05:00
this . logger . debug ( ` Appending WARC ${ fileSizeObjects [ j ] . fileName } ` ) ;
2021-04-29 14:34:56 -07:00
const reader = fs . createReadStream ( fileSizeObjects [ j ] . fileName ) ;
const p = new Promise ( ( resolve ) => {
reader . on ( "end" , ( ) => resolve ( ) ) ;
} ) ;
reader . pipe ( fh , { end : false } ) ;
await p ;
}
if ( fh ) {
await fh . end ( ) ;
2021-03-31 13:41:27 -04:00
}
2022-12-15 12:38:41 -05:00
this . logger . debug ( ` Combined WARCs saved as: ${ generatedCombinedWarcs } ` ) ;
2021-03-31 13:41:27 -04:00
}

  // Serialize the crawl config, along with the current crawl state, to a YAML
  // save-state file in the crawls/ subdirectory of the collection
  async serializeConfig(done = false) {
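    // --saveState semantics:
    //   "never"   - never write a save-state file
    //   "partial" - write a save state only when exiting before the crawl is finished (i.e. interrupted)
    //   "always"  - write a save state at every save interval and on completion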
    switch (this.params.saveState) {
    case "never":
      return;

    case "partial":
      if (!done) {
        return;
      }

      if (await this.crawlState.isFinished()) {
        return;
      }
      break;

    case "always":
    default:
      break;
    }

    const now = new Date();

    if (!done) {
      // if not done, save state only after the specified interval has elapsed
      if ((now.getTime() - this.lastSaveTime) < this.saveStateInterval) {
        return;
      }
    }

    this.lastSaveTime = now.getTime();

    // compact timestamp for the save-state filename, e.g. "20220314104156"
    const ts = now.toISOString().slice(0, 19).replace(/[T:-]/g, "");

    const crawlDir = path.join(this.collDir, "crawls");
    await fsp.mkdir(crawlDir, {recursive: true});

    const filenameOnly = `crawl-${ts}-${this.params.crawlId}.yaml`;
    const filename = path.join(crawlDir, filenameOnly);

    const state = await this.crawlState.serialize();

    if (this.origConfig) {
      this.origConfig.state = state;
    }

    const res = yaml.dump(this.origConfig, {lineWidth: -1});
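
    // For illustration, the saved YAML is the original crawl config plus a
    // "state" section; the exact fields come from crawlState.serialize() and
    // the shape below is an assumption, not taken from the actual output:
    //
    //   seeds:
    //     - https://example.com/
    //   state:
    //     done: [...]
    //     queued: [...]
    //     pending: [...]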

    try {
      this.logger.info(`Saving crawl state to: ${filename}`);
      await fsp.writeFile(filename, res);
    } catch (e) {
      this.logger.error(`Failed to write save state file: ${filename}`, e);
      return;
    }

    this.saveStateFiles.push(filename);

    if (this.saveStateFiles.length > this.params.saveStateHistory) {
      // keep at most saveStateHistory save-state files, pruning the oldest
      const oldFilename = this.saveStateFiles.shift();
      this.logger.info(`Removing old save-state: ${oldFilename}`);
      try {
        await fsp.unlink(oldFilename);
      } catch (e) {
        this.logger.error(`Failed to delete old save state file: ${oldFilename}`, e);
      }
    }

    // when exiting (done) with saveState "always", also upload the final
    // save state to remote storage, if configured
    if (this.storage && done && this.params.saveState === "always") {
      const targetFilename = interpolateFilename(filenameOnly, this.crawlId);

      await this.storage.uploadFile(filename, targetFilename);
    }
  }
}
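
// A minimal sketch (illustrative only, not part of the crawler) of the rollover
// decision used by combineWARC() above: each WARC is appended to the current
// combined file unless doing so would reach rolloverSize, in which case a new
// combined file is started. This helper (a hypothetical name, and ignoring the
// size of the warcinfo record) groups file sizes the same way, which can help
// in reasoning about how many combined WARCs a given rolloverSize will yield.
function planCombinedWarcs(fileSizes, rolloverSize) {
  const groups = [];
  let current = null;
  let currentSize = 0;

  for (const size of fileSizes) {
    // roll over when there is no open group, or appending would reach the limit
    if (!current || currentSize + size >= rolloverSize) {
      current = [];
      currentSize = 0;
      groups.push(current);
    }
    current.push(size);
    currentSize += size;
  }

  return groups;
}

// e.g. planCombinedWarcs([5, 4, 3, 2], 8) -> [[5], [4, 3], [2]]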

// Returns true if an aborted request can safely be ignored: the browser
// reports net::ERR_ABORTED for a top-level document request when the response
// triggered a download (content-disposition, or a non-text content-type)
// rather than a navigation.
function shouldIgnoreAbort(req) {
  try {
    const failure = req.failure() && req.failure().errorText;
    if (failure !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
      return false;
    }

    const resp = req.response();
    const headers = resp && resp.headers();

    if (!headers) {
      return false;
    }

    if (headers["content-disposition"] ||
        (headers["content-type"] && !headers["content-type"].startsWith("text/"))) {
      return true;
    }
  } catch (e) {
    return false;
  }

  return false;
}
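
// A minimal usage sketch (assumed wiring, not from the original source):
// page.goto() can reject with net::ERR_ABORTED when a top-level navigation
// turns out to be a download; shouldIgnoreAbort() lets the caller treat that
// abort as benign. "gotoIgnoringDownloadAborts" is a hypothetical helper name.
async function gotoIgnoringDownloadAborts(page, url, opts = {waitUntil: "load"}) {
  let docRequest = null;

  // remember the most recent top-level document request for this page
  const onRequest = (req) => {
    if (req.resourceType() === "document") {
      docRequest = req;
    }
  };

  page.on("request", onRequest);

  try {
    return await page.goto(url, opts);
  } catch (e) {
    if (docRequest && shouldIgnoreAbort(docRequest)) {
      // the abort was caused by a download, not a real navigation failure
      return null;
    }
    throw e;
  } finally {
    page.off("request", onRequest);
  }
}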