2020-10-31 13:16:37 -07:00
|
|
|
const child_process = require("child_process");
|
2020-11-01 19:22:53 -08:00
|
|
|
const path = require("path");
|
2020-12-02 16:26:20 +00:00
|
|
|
const fs = require("fs");
|
2021-11-23 12:53:30 -08:00
|
|
|
const os = require("os");
|
2021-04-29 14:34:56 -07:00
|
|
|
const fsp = require("fs/promises");
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2020-10-31 13:16:37 -07:00
|
|
|
// to ignore HTTPS error for HEAD check:
// certificate verification is turned off on this agent only.
// Constructed with `new`, the documented form, rather than relying on the
// constructor being callable as a plain function.
const HTTPS_AGENT = new (require("https").Agent)({
  rejectUnauthorized: false,
});

// Plain HTTP agent created with default options.
const HTTP_AGENT = new (require("http").Agent)();
|
|
|
|
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
const fetch = require("node-fetch");
|
|
|
|
const puppeteer = require("puppeteer-core");
|
|
|
|
const { Cluster } = require("puppeteer-cluster");
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
const { RedisCrawlState, MemoryCrawlState } = require("./util/state");
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
const AbortController = require("abort-controller");
|
|
|
|
const Sitemapper = require("sitemapper");
|
|
|
|
const { v4: uuidv4 } = require("uuid");
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
const yaml = require("js-yaml");
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
|
|
|
const warcio = require("warcio");
|
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
// Contents of node_modules/browsertrix-behaviors/dist/behaviors.js (resolved
// relative to this module's directory), read synchronously as a UTF-8 string
// when this module is loaded.
const behaviors = fs.readFileSync(path.join(__dirname, "node_modules", "browsertrix-behaviors", "dist", "behaviors.js"), {encoding: "utf8"});
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
2021-06-23 19:36:32 -07:00
|
|
|
const TextExtract = require("./util/textextract");
|
2022-02-08 15:31:55 -08:00
|
|
|
const { S3StorageSync, getFileSize } = require("./util/storage");
|
2022-02-23 12:09:48 -08:00
|
|
|
const { ScreenCaster, WSTransport, RedisPubSubTransport } = require("./util/screencaster");
|
2021-06-23 19:36:32 -07:00
|
|
|
const { parseArgs } = require("./util/argParser");
|
2021-11-23 12:53:30 -08:00
|
|
|
const { initRedis } = require("./util/redis");
|
2021-06-07 17:43:36 -07:00
|
|
|
|
2022-02-20 22:22:19 -08:00
|
|
|
const { getBrowserExe, loadProfile, evaluateWithCLI } = require("./util/browser");
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
2021-07-23 18:31:43 -07:00
|
|
|
const { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } = require("./util/constants");
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2021-07-19 15:49:43 -07:00
|
|
|
const { BlockRules } = require("./util/blockrules");
|
|
|
|
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
// ============================================================================
|
|
|
|
class Crawler {
|
|
|
|
constructor() {
|
2020-11-03 17:16:29 +00:00
|
|
|
this.headers = {};
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
this.crawlState = null;
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2020-11-14 19:32:31 +00:00
|
|
|
this.emulateDevice = null;
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
// pages file
|
|
|
|
this.pagesFH = null;
|
|
|
|
|
2021-01-29 18:26:55 +00:00
|
|
|
// was the limit hit?
|
|
|
|
this.limitHit = false;
|
|
|
|
|
2020-11-14 19:32:31 +00:00
|
|
|
this.userAgent = "";
|
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
const res = parseArgs();
|
|
|
|
this.params = res.parsed;
|
|
|
|
this.origConfig = res.origConfig;
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
this.saveStateFiles = [];
|
|
|
|
this.lastSaveTime = 0;
|
|
|
|
this.saveStateInterval = this.params.saveStateInterval * 1000;
|
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.debugLogging = this.params.logging.includes("debug");
|
|
|
|
|
|
|
|
this.profileDir = loadProfile(this.params.profile);
|
|
|
|
|
|
|
|
if (this.params.profile) {
|
|
|
|
this.statusLog("With Browser Profile: " + this.params.profile);
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
this.emulateDevice = this.params.emulateDevice;
|
2020-11-01 19:22:53 -08:00
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.debugLog("Seeds", this.params.scopedSeeds);
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2021-07-19 15:49:43 -07:00
|
|
|
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
|
2021-07-23 18:31:43 -07:00
|
|
|
this.capturePrefix = this.captureBasePrefix + "/id_/";
|
2021-02-04 00:28:32 -05:00
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
this.gotoOpts = {
|
|
|
|
waitUntil: this.params.waitUntil,
|
|
|
|
timeout: this.params.timeout
|
|
|
|
};
|
2021-02-04 00:28:32 -05:00
|
|
|
|
|
|
|
// root collections dir
|
|
|
|
this.collDir = path.join(this.params.cwd, "collections", this.params.collection);
|
|
|
|
|
|
|
|
// pages directory
|
|
|
|
this.pagesDir = path.join(this.collDir, "pages");
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2021-02-04 00:28:32 -05:00
|
|
|
// pages file
|
|
|
|
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
|
2021-07-19 15:49:43 -07:00
|
|
|
|
|
|
|
this.blockRules = null;
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
statusLog(...args) {
|
|
|
|
console.log(...args);
|
|
|
|
}
|
|
|
|
|
|
|
|
debugLog(...args) {
|
|
|
|
if (this.debugLogging) {
|
|
|
|
console.log(...args);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-11-14 19:32:31 +00:00
|
|
|
configureUA() {
|
|
|
|
// override userAgent
|
|
|
|
if (this.params.userAgent) {
|
|
|
|
|
|
|
|
if (this.emulateDevice) {
|
|
|
|
this.emulateDevice.userAgent = this.params.userAgent;
|
|
|
|
}
|
|
|
|
|
|
|
|
this.userAgent = this.params.userAgent;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.browserExe = getBrowserExe();
|
|
|
|
|
2020-11-14 19:32:31 +00:00
|
|
|
// if device set, it overrides the default Chrome UA
|
|
|
|
if (this.emulateDevice) {
|
|
|
|
this.userAgent = this.emulateDevice.userAgent;
|
|
|
|
} else {
|
2021-02-03 22:24:38 -08:00
|
|
|
let version = process.env.BROWSER_VERSION;
|
2020-11-14 19:32:31 +00:00
|
|
|
|
|
|
|
try {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
version = child_process.execFileSync(this.browserExe, ["--version"], {encoding: "utf8"});
|
|
|
|
version = version.match(/[\d.]+/)[0];
|
2021-02-04 00:28:32 -05:00
|
|
|
} catch(e) {
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
console.error(e);
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
2020-11-14 19:32:31 +00:00
|
|
|
|
|
|
|
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
|
|
|
|
}
|
|
|
|
|
|
|
|
// suffix to append to default userAgent
|
|
|
|
if (this.params.userAgentSuffix) {
|
|
|
|
this.userAgent += " " + this.params.userAgentSuffix;
|
|
|
|
|
|
|
|
if (this.emulateDevice) {
|
|
|
|
this.emulateDevice.userAgent += " " + this.params.userAgentSuffix;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
async initCrawlState() {
|
|
|
|
const redisUrl = this.params.redisStoreUrl;
|
|
|
|
|
|
|
|
if (redisUrl) {
|
|
|
|
if (!redisUrl.startsWith("redis://")) {
|
|
|
|
throw new Error("stateStoreUrl must start with redis:// -- Only redis-based store currently supported");
|
|
|
|
}
|
|
|
|
|
2021-11-23 12:53:30 -08:00
|
|
|
let redis;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
|
|
|
try {
|
2021-11-23 12:53:30 -08:00
|
|
|
redis = await initRedis(redisUrl);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
} catch (e) {
|
|
|
|
throw new Error("Unable to connect to state store Redis: " + redisUrl);
|
|
|
|
}
|
|
|
|
|
|
|
|
this.statusLog(`Storing state via Redis ${redisUrl} @ key prefix "${this.params.crawlId}"`);
|
|
|
|
|
|
|
|
this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout);
|
2022-02-23 12:09:48 -08:00
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
} else {
|
|
|
|
this.statusLog("Storing state in memory");
|
|
|
|
|
|
|
|
this.crawlState = new MemoryCrawlState();
|
|
|
|
}
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
if (this.params.saveState === "always" && this.params.saveStateInterval) {
|
|
|
|
this.statusLog(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`);
|
|
|
|
}
|
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
return this.crawlState;
|
|
|
|
}
|
|
|
|
|
2022-02-23 12:09:48 -08:00
|
|
|
initScreenCaster() {
|
|
|
|
let transport;
|
|
|
|
|
|
|
|
if (this.params.screencastPort) {
|
|
|
|
transport = new WSTransport(this.params.screencastPort);
|
|
|
|
this.debugLog(`Screencast server started on: ${this.params.screencastPort}`);
|
|
|
|
} else if (this.params.redisStoreUrl && this.params.screencastRedis) {
|
|
|
|
const crawlId = process.env.CRAWL_ID || os.hostname();
|
|
|
|
transport = new RedisPubSubTransport(this.params.redisStoreUrl, crawlId);
|
|
|
|
this.debugLog("Screencast enabled via redis pubsub");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!transport) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
2022-03-02 13:26:11 -08:00
|
|
|
return new ScreenCaster(transport, this.params.workers);
|
2022-02-23 12:09:48 -08:00
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
bootstrap() {
|
2021-03-31 13:41:27 -04:00
|
|
|
let opts = {};
|
2021-03-13 16:48:31 -08:00
|
|
|
if (this.params.logging.includes("pywb")) {
|
2021-03-04 15:36:58 -05:00
|
|
|
opts = {stdio: "inherit", cwd: this.params.cwd};
|
|
|
|
}
|
|
|
|
else{
|
|
|
|
opts = {stdio: "ignore", cwd: this.params.cwd};
|
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2020-11-14 20:51:07 +00:00
|
|
|
this.configureUA();
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2020-11-03 17:16:29 +00:00
|
|
|
this.headers = {"User-Agent": this.userAgent};
|
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
const subprocesses = [];
|
|
|
|
|
|
|
|
subprocesses.push(child_process.spawn("redis-server", {...opts, cwd: "/tmp/"}));
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);
|
|
|
|
|
2021-03-31 13:41:27 -04:00
|
|
|
opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
subprocesses.push(child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts));
|
|
|
|
|
|
|
|
process.on("exit", () => {
|
|
|
|
for (const proc of subprocesses) {
|
|
|
|
proc.kill();
|
|
|
|
}
|
|
|
|
});
|
2020-11-01 19:22:53 -08:00
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (!this.params.headless && !process.env.NO_XVFB) {
|
2020-11-01 19:22:53 -08:00
|
|
|
child_process.spawn("Xvfb", [
|
|
|
|
process.env.DISPLAY,
|
|
|
|
"-listen",
|
|
|
|
"tcp",
|
|
|
|
"-screen",
|
|
|
|
"0",
|
|
|
|
process.env.GEOMETRY,
|
|
|
|
"-ac",
|
|
|
|
"+extension",
|
|
|
|
"RANDR"
|
|
|
|
]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
get chromeArgs() {
|
|
|
|
// Chrome Flags, including proxy server
|
|
|
|
return [
|
2022-01-26 12:56:35 -08:00
|
|
|
...(process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean),
|
2020-11-01 19:22:53 -08:00
|
|
|
"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
|
|
|
|
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
|
|
|
|
"--no-sandbox",
|
|
|
|
"--disable-background-media-suspend",
|
|
|
|
"--autoplay-policy=no-user-gesture-required",
|
2021-01-29 00:33:01 -08:00
|
|
|
"--disable-features=IsolateOrigins,site-per-process",
|
2021-05-21 15:37:02 -07:00
|
|
|
"--disable-popup-blocking",
|
|
|
|
"--disable-backgrounding-occluded-windows",
|
2020-11-01 19:22:53 -08:00
|
|
|
];
|
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
get puppeteerArgs() {
|
|
|
|
// Puppeter Options
|
|
|
|
return {
|
|
|
|
headless: this.params.headless,
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
executablePath: this.browserExe,
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
handleSIGINT: false,
|
|
|
|
handleSIGTERM: false,
|
|
|
|
handleSIGHUP: false,
|
2020-11-01 19:22:53 -08:00
|
|
|
ignoreHTTPSErrors: true,
|
2021-04-10 13:08:22 -07:00
|
|
|
args: this.chromeArgs,
|
|
|
|
userDataDir: this.profileDir,
|
|
|
|
defaultViewport: null,
|
2020-11-01 19:22:53 -08:00
|
|
|
};
|
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async run() {
|
2021-08-17 20:54:18 -07:00
|
|
|
await fsp.mkdir(this.params.cwd, {recursive: true});
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
this.bootstrap();
|
2020-10-31 13:16:37 -07:00
|
|
|
|
|
|
|
try {
|
2020-11-01 19:22:53 -08:00
|
|
|
await this.crawl();
|
|
|
|
process.exit(0);
|
2020-10-31 13:16:37 -07:00
|
|
|
} catch(e) {
|
2020-11-01 19:22:53 -08:00
|
|
|
console.error("Crawl failed");
|
|
|
|
console.error(e);
|
|
|
|
process.exit(1);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2021-04-10 13:08:22 -07:00
|
|
|
|
|
|
|
_behaviorLog({data, type}) {
|
|
|
|
switch (type) {
|
|
|
|
case "info":
|
|
|
|
console.log(JSON.stringify(data));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case "debug":
|
|
|
|
default:
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
if (this.params.behaviorsLogDebug) {
|
2021-04-10 13:08:22 -07:00
|
|
|
console.log("behavior debug: " + JSON.stringify(data));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-08 22:21:34 -08:00
|
|
|
async crawlPage({page, data}) {
|
|
|
|
try {
|
2021-06-07 17:43:36 -07:00
|
|
|
if (this.screencaster) {
|
|
|
|
await this.screencaster.newTarget(page.target());
|
|
|
|
}
|
|
|
|
|
2021-02-08 22:21:34 -08:00
|
|
|
if (this.emulateDevice) {
|
|
|
|
await page.emulate(this.emulateDevice);
|
|
|
|
}
|
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
if (this.params.profile) {
|
2021-07-20 15:45:51 -07:00
|
|
|
await page._client.send("Network.setBypassServiceWorker", {bypass: true});
|
|
|
|
}
|
|
|
|
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
if (this.params.behaviorOpts && !page.__bx_inited) {
|
2021-04-10 13:08:22 -07:00
|
|
|
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
|
2021-06-07 17:43:36 -07:00
|
|
|
page.__bx_inited = true;
|
2021-03-13 16:48:31 -08:00
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
|
|
|
|
// run custom driver here
|
|
|
|
await this.driver({page, data, crawler: this});
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2021-02-08 22:21:34 -08:00
|
|
|
const title = await page.title();
|
2021-03-31 13:41:27 -04:00
|
|
|
let text = "";
|
2021-03-13 16:48:31 -08:00
|
|
|
if (this.params.text) {
|
2021-02-23 16:52:54 -05:00
|
|
|
const client = await page.target().createCDPSession();
|
|
|
|
const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
|
2021-03-04 15:36:58 -05:00
|
|
|
text = await new TextExtract(result).parseTextFromDom();
|
2021-02-23 16:52:54 -05:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
await this.writePage(data, title, this.params.text ? text : null);
|
2021-02-08 22:21:34 -08:00
|
|
|
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
if (this.params.behaviorOpts) {
|
2022-02-20 22:22:19 -08:00
|
|
|
await Promise.allSettled(page.frames().map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();")));
|
2021-02-08 22:21:34 -08:00
|
|
|
}
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
await this.writeStats();
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
await this.serializeConfig();
|
|
|
|
|
2021-02-08 22:21:34 -08:00
|
|
|
} catch (e) {
|
|
|
|
console.warn(e);
|
2022-03-02 13:26:11 -08:00
|
|
|
} finally {
|
|
|
|
|
|
|
|
try {
|
|
|
|
if (this.screencaster) {
|
|
|
|
await this.screencaster.endTarget(page.target());
|
|
|
|
}
|
|
|
|
} catch (e) {
|
|
|
|
console.warn(e);
|
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
}
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2021-03-31 13:41:27 -04:00
|
|
|
async createWARCInfo(filename) {
|
2021-07-07 18:56:52 -04:00
|
|
|
const warcVersion = "WARC/1.0";
|
2021-03-31 13:41:27 -04:00
|
|
|
const type = "warcinfo";
|
2021-04-29 14:34:56 -07:00
|
|
|
const packageFileJSON = JSON.parse(await fsp.readFile("../app/package.json"));
|
2021-07-07 18:56:52 -04:00
|
|
|
const warcioPackageJSON = JSON.parse(await fsp.readFile("/app/node_modules/warcio/package.json"));
|
2021-06-24 15:39:17 -07:00
|
|
|
const pywbVersion = child_process.execSync("pywb -V", {encoding: "utf8"}).trim().split(" ")[1];
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
const info = {
|
2021-07-07 18:56:52 -04:00
|
|
|
"software": `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version} pywb ${pywbVersion})`,
|
|
|
|
"format": "WARC File Format 1.0"
|
2021-03-31 13:41:27 -04:00
|
|
|
};
|
2021-07-07 18:56:52 -04:00
|
|
|
|
|
|
|
const warcInfo = {...info, ...this.params.warcInfo, };
|
|
|
|
const record = await warcio.WARCRecord.createWARCInfo({filename, type, warcVersion}, warcInfo);
|
2021-03-31 13:41:27 -04:00
|
|
|
const buffer = await warcio.WARCSerializer.serialize(record, {gzip: true});
|
|
|
|
return buffer;
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async crawl() {
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2020-10-31 13:16:37 -07:00
|
|
|
try {
|
2020-11-01 19:22:53 -08:00
|
|
|
this.driver = require(this.params.driver);
|
|
|
|
} catch(e) {
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
console.warn(e);
|
2020-11-01 19:22:53 -08:00
|
|
|
return;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2021-11-23 12:53:30 -08:00
|
|
|
if (this.params.generateWACZ && process.env.STORE_ENDPOINT_URL) {
|
|
|
|
const endpointUrl = process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
|
|
|
|
const storeInfo = {
|
|
|
|
endpointUrl,
|
|
|
|
accessKey: process.env.STORE_ACCESS_KEY,
|
|
|
|
secretKey: process.env.STORE_SECRET_KEY,
|
|
|
|
};
|
|
|
|
|
|
|
|
const opts = {
|
|
|
|
crawlId: process.env.CRAWL_ID || os.hostname(),
|
|
|
|
webhookUrl: process.env.WEBHOOK_URL,
|
|
|
|
userId: process.env.STORE_USER,
|
|
|
|
filename: process.env.STORE_FILENAME || "@ts-@id.wacz",
|
|
|
|
};
|
|
|
|
|
|
|
|
console.log("Initing Storage...");
|
|
|
|
this.storage = new S3StorageSync(storeInfo, opts);
|
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
// Puppeteer Cluster init and options
|
|
|
|
this.cluster = await Cluster.launch({
|
|
|
|
concurrency: this.params.newContext,
|
|
|
|
maxConcurrency: this.params.workers,
|
2021-10-27 20:49:37 -07:00
|
|
|
skipDuplicateUrls: false,
|
2020-11-01 19:22:53 -08:00
|
|
|
timeout: this.params.timeout * 2,
|
|
|
|
puppeteerOptions: this.puppeteerArgs,
|
|
|
|
puppeteer,
|
2021-03-13 16:48:31 -08:00
|
|
|
monitor: this.params.logging.includes("stats")
|
2020-11-01 19:22:53 -08:00
|
|
|
});
|
2020-10-31 13:16:37 -07:00
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
|
|
|
this.cluster.jobQueue = await this.initCrawlState();
|
|
|
|
|
|
|
|
if (this.params.state) {
|
|
|
|
await this.crawlState.load(this.params.state, this.params.scopedSeeds, true);
|
|
|
|
}
|
|
|
|
|
2021-02-08 22:21:34 -08:00
|
|
|
this.cluster.task((opts) => this.crawlPage(opts));
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
await this.initPages();
|
2021-06-07 17:43:36 -07:00
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
if (this.params.blockRules && this.params.blockRules.length) {
|
2021-07-27 09:41:21 -07:00
|
|
|
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
|
2021-07-19 15:49:43 -07:00
|
|
|
}
|
|
|
|
|
2022-02-23 12:09:48 -08:00
|
|
|
this.screencaster = this.initScreenCaster();
|
2021-06-07 17:43:36 -07:00
|
|
|
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
for (let i = 0; i < this.params.scopedSeeds.length; i++) {
|
|
|
|
const seed = this.params.scopedSeeds[i];
|
2022-01-15 09:03:09 -08:00
|
|
|
if (!await this.queueUrl(i, seed.url, 0, 0)) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (this.limitHit) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
if (seed.sitemap) {
|
|
|
|
await this.parseSitemap(seed.sitemap, i);
|
|
|
|
}
|
2020-11-14 21:55:02 +00:00
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
await this.cluster.idle();
|
|
|
|
await this.cluster.close();
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
await this.serializeConfig(true);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2020-12-02 16:26:20 +00:00
|
|
|
this.writeStats();
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
if (this.pagesFH) {
|
2021-05-21 15:37:02 -07:00
|
|
|
await this.pagesFH.sync();
|
2021-04-29 14:34:56 -07:00
|
|
|
await this.pagesFH.close();
|
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
// extra wait for all resources to land into WARCs
|
2021-04-30 12:31:14 -07:00
|
|
|
await this.awaitPendingClear();
|
|
|
|
|
2021-03-31 13:41:27 -04:00
|
|
|
if (this.params.combineWARC) {
|
|
|
|
await this.combineWARC();
|
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2020-11-03 21:33:19 +00:00
|
|
|
if (this.params.generateCDX) {
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.statusLog("Generating CDX");
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2020-11-02 15:28:19 +00:00
|
|
|
child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
|
|
|
if (this.params.generateWACZ) {
|
2022-02-08 15:31:55 -08:00
|
|
|
await this.generateWACZ();
|
|
|
|
}
|
|
|
|
}
|
2022-01-26 16:06:10 -08:00
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
async generateWACZ() {
|
|
|
|
this.statusLog("Generating WACZ");
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
const archiveDir = path.join(this.collDir, "archive");
|
2021-11-23 12:53:30 -08:00
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
// Get a list of the warcs inside
|
|
|
|
const warcFileList = await fsp.readdir(archiveDir);
|
|
|
|
|
|
|
|
console.log(`Num WARC Files: ${warcFileList.length}`);
|
|
|
|
if (!warcFileList.length) {
|
|
|
|
throw new Error("No WARC Files, assuming crawl failed");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Build the argument list to pass to the wacz create command
|
|
|
|
const waczFilename = this.params.collection.concat(".wacz");
|
|
|
|
const waczPath = path.join(this.collDir, waczFilename);
|
|
|
|
|
|
|
|
const createArgs = ["create", "--split-seeds", "-o", waczPath, "--pages", this.pagesFile];
|
|
|
|
const validateArgs = ["validate"];
|
|
|
|
|
|
|
|
if (process.env.WACZ_SIGN_URL) {
|
|
|
|
createArgs.push("--signing-url");
|
|
|
|
createArgs.push(process.env.WACZ_SIGN_URL);
|
|
|
|
if (process.env.WACZ_SIGN_TOKEN) {
|
|
|
|
createArgs.push("--signing-token");
|
|
|
|
createArgs.push(process.env.WACZ_SIGN_TOKEN);
|
2021-11-23 12:53:30 -08:00
|
|
|
}
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
2022-02-08 15:31:55 -08:00
|
|
|
|
|
|
|
createArgs.push("-f");
|
|
|
|
validateArgs.push("-f");
|
|
|
|
|
|
|
|
warcFileList.forEach((val, index) => createArgs.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars
|
|
|
|
|
|
|
|
// create WACZ
|
|
|
|
const waczResult = child_process.spawnSync("wacz" , createArgs, {stdio: "inherit"});
|
|
|
|
|
|
|
|
if (waczResult.status !== 0) {
|
|
|
|
console.log("create result", waczResult);
|
|
|
|
throw new Error("Unable to write WACZ successfully");
|
|
|
|
}
|
|
|
|
|
|
|
|
this.debugLog(`WACZ successfully generated and saved to: ${waczPath}`);
|
|
|
|
|
|
|
|
// Verify WACZ
|
|
|
|
validateArgs.push(waczPath);
|
|
|
|
|
|
|
|
const waczVerifyResult = child_process.spawnSync("wacz", validateArgs, {stdio: "inherit"});
|
|
|
|
|
|
|
|
if (waczVerifyResult.status !== 0) {
|
|
|
|
console.log("validate", waczVerifyResult);
|
|
|
|
throw new Error("Unable to verify WACZ created successfully");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (this.storage) {
|
|
|
|
const finished = await this.crawlState.finished();
|
|
|
|
await this.storage.uploadCollWACZ(waczPath, finished);
|
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
async writeStats() {
|
2020-12-02 16:26:20 +00:00
|
|
|
if (this.params.statsFilename) {
|
|
|
|
const total = this.cluster.allTargetCount;
|
|
|
|
const workersRunning = this.cluster.workersBusy.length;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
const numCrawled = total - (await this.cluster.jobQueue.size()) - workersRunning;
|
2021-01-29 18:26:55 +00:00
|
|
|
const limit = {max: this.params.limit || 0, hit: this.limitHit};
|
|
|
|
const stats = {numCrawled, workersRunning, total, limit};
|
2020-12-02 16:26:20 +00:00
|
|
|
|
|
|
|
try {
|
2021-04-29 14:34:56 -07:00
|
|
|
await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2));
|
2020-12-02 16:26:20 +00:00
|
|
|
} catch (err) {
|
|
|
|
console.warn("Stats output failed", err);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-07-23 18:31:43 -07:00
|
|
|
async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
|
2022-01-15 09:03:09 -08:00
|
|
|
const {url, seedId, depth, extraHops = 0} = urlData;
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
if (!await this.isHTML(url)) {
|
2021-07-23 18:31:43 -07:00
|
|
|
try {
|
2022-03-14 11:11:53 -07:00
|
|
|
if (await this.directFetchCapture(url)) {
|
|
|
|
return;
|
|
|
|
}
|
2021-07-23 18:31:43 -07:00
|
|
|
} catch (e) {
|
|
|
|
// ignore failed direct fetch attempt, do browser-based capture
|
|
|
|
}
|
2021-05-21 15:37:02 -07:00
|
|
|
}
|
|
|
|
|
2021-07-19 15:49:43 -07:00
|
|
|
if (this.blockRules) {
|
|
|
|
await this.blockRules.initPage(page);
|
|
|
|
}
|
|
|
|
|
2022-03-14 11:11:53 -07:00
|
|
|
let ignoreAbort = false;
|
|
|
|
|
|
|
|
// Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
|
|
|
|
// if so, don't report as an error
|
|
|
|
page.on("requestfailed", (req) => {
|
|
|
|
const failure = req.failure().errorText;
|
|
|
|
if (failure !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
const resp = req.response();
|
|
|
|
const headers = resp && resp.headers();
|
|
|
|
|
|
|
|
if (!headers) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (headers["content-disposition"] ||
|
|
|
|
(headers["content-type"] && !headers["content-type"].startsWith("text/"))) {
|
|
|
|
ignoreAbort = true;
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
try {
|
|
|
|
await page.goto(url, this.gotoOpts);
|
|
|
|
} catch (e) {
|
2022-03-14 11:11:53 -07:00
|
|
|
let msg = e.message || "";
|
|
|
|
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
|
|
|
|
this.statusLog(`ERROR: ${url}: ${msg}`);
|
|
|
|
}
|
2021-05-21 15:37:02 -07:00
|
|
|
}
|
|
|
|
|
2021-07-20 15:45:51 -07:00
|
|
|
const seed = this.params.scopedSeeds[seedId];
|
|
|
|
|
|
|
|
// skip extraction if at max depth
|
2021-07-23 18:31:43 -07:00
|
|
|
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
2021-07-20 15:45:51 -07:00
|
|
|
return;
|
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2021-07-23 18:31:43 -07:00
|
|
|
for (const opts of selectorOptsList) {
|
|
|
|
const links = await this.extractLinks(page, opts);
|
2022-01-15 09:03:09 -08:00
|
|
|
await this.queueInScopeUrls(seedId, links, depth, extraHops);
|
2021-07-23 18:31:43 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
async extractLinks(page, {selector = "a[href]", extract = "href", isAttribute = false} = {}) {
|
|
|
|
const results = [];
|
|
|
|
|
|
|
|
const loadProp = (selector, extract) => {
|
|
|
|
return [...document.querySelectorAll(selector)].map(elem => elem[extract]);
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
};
|
|
|
|
|
2021-07-23 18:31:43 -07:00
|
|
|
const loadAttr = (selector, extract) => {
|
|
|
|
return [...document.querySelectorAll(selector)].map(elem => elem.getAttribute(extract));
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
const loadFunc = isAttribute ? loadAttr : loadProp;
|
|
|
|
|
2020-10-31 13:16:37 -07:00
|
|
|
try {
|
2021-07-23 18:31:43 -07:00
|
|
|
const linkResults = await Promise.allSettled(page.frames().map(frame => frame.evaluate(loadFunc, selector, extract)));
|
2021-07-20 15:45:51 -07:00
|
|
|
|
|
|
|
if (linkResults) {
|
|
|
|
for (const linkResult of linkResults) {
|
2021-07-23 18:31:43 -07:00
|
|
|
if (!linkResult.value) continue;
|
2021-07-20 15:45:51 -07:00
|
|
|
for (const link of linkResult.value) {
|
|
|
|
results.push(link);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-10-31 13:16:37 -07:00
|
|
|
} catch (e) {
|
|
|
|
console.warn("Link Extraction failed", e);
|
|
|
|
}
|
2021-07-23 18:31:43 -07:00
|
|
|
return results;
|
2020-11-14 21:55:02 +00:00
|
|
|
}
|
|
|
|
|
2022-01-15 09:03:09 -08:00
|
|
|
async queueInScopeUrls(seedId, urls, depth, extraHops = 0) {
|
2020-10-31 13:16:37 -07:00
|
|
|
try {
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
depth += 1;
|
|
|
|
const seed = this.params.scopedSeeds[seedId];
|
|
|
|
|
2022-01-15 09:03:09 -08:00
|
|
|
// new number of extra hops, set if this hop is out-of-scope (oos)
|
|
|
|
const newExtraHops = extraHops + 1;
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
2022-01-15 09:03:09 -08:00
|
|
|
for (const possibleUrl of urls) {
|
|
|
|
const res = seed.isIncluded(possibleUrl, depth, newExtraHops);
|
|
|
|
|
|
|
|
if (!res) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
const {url, isOOS} = res;
|
|
|
|
|
|
|
|
if (url) {
|
|
|
|
await this.queueUrl(seedId, url, depth, isOOS ? newExtraHops : extraHops);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
} catch (e) {
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
console.error("Queuing Error: ", e);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-15 09:03:09 -08:00
|
|
|
async queueUrl(seedId, url, depth, extraHops = 0) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (this.limitHit) {
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2021-11-23 12:53:30 -08:00
|
|
|
if (this.params.limit > 0 && (await this.crawlState.numRealSeen() >= this.params.limit)) {
|
2021-01-29 18:26:55 +00:00
|
|
|
this.limitHit = true;
|
2020-11-01 19:22:53 -08:00
|
|
|
return false;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
|
|
|
if (await this.crawlState.has(url)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
await this.crawlState.add(url);
|
2022-01-15 09:03:09 -08:00
|
|
|
const urlData = {url, seedId, depth};
|
|
|
|
if (extraHops) {
|
|
|
|
urlData.extraHops = extraHops;
|
|
|
|
}
|
|
|
|
this.cluster.queue(urlData);
|
2020-11-01 19:22:53 -08:00
|
|
|
return true;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
async initPages() {
|
2021-02-04 00:28:32 -05:00
|
|
|
try {
|
2021-04-29 14:34:56 -07:00
|
|
|
let createNew = false;
|
|
|
|
|
2021-04-30 22:05:04 -04:00
|
|
|
// create pages dir if doesn't exist and write pages.jsonl header
|
|
|
|
if (fs.existsSync(this.pagesDir) != true){
|
|
|
|
await fsp.mkdir(this.pagesDir);
|
2021-04-29 14:34:56 -07:00
|
|
|
createNew = true;
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
this.pagesFH = await fsp.open(this.pagesFile, "a");
|
|
|
|
|
|
|
|
if (createNew) {
|
2021-03-31 13:41:27 -04:00
|
|
|
const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
|
2021-02-23 16:52:54 -05:00
|
|
|
if (this.params.text) {
|
2021-03-31 13:41:27 -04:00
|
|
|
header["hasText"] = true;
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.statusLog("Text Extraction: Enabled");
|
|
|
|
} else {
|
2021-03-31 13:41:27 -04:00
|
|
|
header["hasText"] = false;
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.statusLog("Text Extraction: Disabled");
|
2021-02-23 16:52:54 -05:00
|
|
|
}
|
2021-03-31 13:41:27 -04:00
|
|
|
const header_formatted = JSON.stringify(header).concat("\n");
|
2021-04-29 14:34:56 -07:00
|
|
|
await this.pagesFH.writeFile(header_formatted);
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
2021-04-29 14:34:56 -07:00
|
|
|
|
2021-02-04 00:28:32 -05:00
|
|
|
} catch(err) {
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
console.error("pages/pages.jsonl creation failed", err);
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
async writePage({url, depth}, title, text) {
|
2021-02-04 00:28:32 -05:00
|
|
|
const id = uuidv4();
|
|
|
|
const row = {"id": id, "url": url, "title": title};
|
2021-02-23 16:52:54 -05:00
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (depth === 0) {
|
|
|
|
row.seed = true;
|
|
|
|
}
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
if (text !== null) {
|
|
|
|
row.text = text;
|
2021-02-23 16:52:54 -05:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
const processedRow = JSON.stringify(row) + "\n";
|
2021-02-04 00:28:32 -05:00
|
|
|
try {
|
2022-03-14 10:41:56 -07:00
|
|
|
await this.pagesFH.writeFile(processedRow);
|
|
|
|
} catch (err) {
|
2021-02-04 00:28:32 -05:00
|
|
|
console.warn("pages/pages.jsonl append failed", err);
|
|
|
|
}
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
resolveAgent(urlParsed) {
|
|
|
|
return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async isHTML(url) {
|
|
|
|
try {
|
|
|
|
const resp = await fetch(url, {
|
|
|
|
method: "HEAD",
|
|
|
|
headers: this.headers,
|
|
|
|
agent: this.resolveAgent
|
|
|
|
});
|
2022-03-14 11:11:53 -07:00
|
|
|
if (resp.status !== 200) {
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.debugLog(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
|
2021-03-13 16:48:31 -08:00
|
|
|
return true;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
const contentType = resp.headers.get("Content-Type");
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
// just load if no content-type
|
|
|
|
if (!contentType) {
|
|
|
|
return true;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
const mime = contentType.split(";")[0];
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
if (HTML_TYPES.includes(mime)) {
|
|
|
|
return true;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
return false;
|
|
|
|
} catch(e) {
|
|
|
|
// can't confirm not html, so try in browser
|
2020-10-31 13:16:37 -07:00
|
|
|
return true;
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async directFetchCapture(url) {
|
|
|
|
//console.log(`Direct capture: ${this.capturePrefix}${url}`);
|
|
|
|
const abort = new AbortController();
|
|
|
|
const signal = abort.signal;
|
2022-03-14 11:11:53 -07:00
|
|
|
const resp = await fetch(this.capturePrefix + url, {signal, headers: this.headers, redirect: "manual"});
|
2020-11-01 19:22:53 -08:00
|
|
|
abort.abort();
|
2022-03-14 11:11:53 -07:00
|
|
|
return resp.status === 200 && !resp.headers.get("set-cookie");
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2021-04-30 12:31:14 -07:00
|
|
|
async awaitPendingClear() {
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.statusLog("Waiting to ensure pending data is written to WARCs...");
|
2021-04-30 12:31:14 -07:00
|
|
|
|
2021-11-23 12:53:30 -08:00
|
|
|
const redis = await initRedis("redis://localhost/0");
|
2021-04-30 12:31:14 -07:00
|
|
|
|
|
|
|
while (true) {
|
|
|
|
const res = await redis.get(`pywb:${this.params.collection}:pending`);
|
|
|
|
if (res === "0" || !res) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.debugLog(`Still waiting for ${res} pending requests to finish...`);
|
2021-04-30 12:31:14 -07:00
|
|
|
|
|
|
|
await this.sleep(1000);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
sleep(time) {
|
|
|
|
return new Promise(resolve => setTimeout(resolve, time));
|
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
async parseSitemap(url, seedId) {
|
2020-11-14 21:55:02 +00:00
|
|
|
const sitemapper = new Sitemapper({
|
|
|
|
url,
|
|
|
|
timeout: 15000,
|
|
|
|
requestHeaders: this.headers
|
|
|
|
});
|
|
|
|
|
|
|
|
try {
|
|
|
|
const { sites } = await sitemapper.fetch();
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
await this.queueInScopeUrls(seedId, sites, 0);
|
2020-11-14 21:55:02 +00:00
|
|
|
} catch(e) {
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
console.warn(e);
|
2020-11-14 21:55:02 +00:00
|
|
|
}
|
|
|
|
}
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
async combineWARC() {
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.statusLog("Generating Combined WARCs");
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
// Get the list of created Warcs
|
2021-04-29 14:34:56 -07:00
|
|
|
const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
|
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.debugLog(`Combining ${warcLists.length} WARCs...`);
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
const fileSizeObjects = []; // Used to sort the created warc by fileSize
|
|
|
|
|
|
|
|
// Go through a list of the created works and create an array sorted by their filesize with the largest file first.
|
|
|
|
for (let i = 0; i < warcLists.length; i++) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
const fileName = path.join(this.collDir, "archive", warcLists[i]);
|
2022-02-08 15:31:55 -08:00
|
|
|
const fileSize = await getFileSize(fileName);
|
2021-03-31 13:41:27 -04:00
|
|
|
fileSizeObjects.push({"fileSize": fileSize, "fileName": fileName});
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
fileSizeObjects.sort((a, b) => b.fileSize - a.fileSize);
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
const generatedCombinedWarcs = [];
|
|
|
|
|
|
|
|
// Used to name combined warcs, default to -1 for first increment
|
|
|
|
let combinedWarcNumber = -1;
|
|
|
|
|
|
|
|
// write combine WARC to collection root
|
|
|
|
let combinedWarcFullPath = "";
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
// fileHandler
|
|
|
|
let fh = null;
|
|
|
|
|
2021-03-31 13:41:27 -04:00
|
|
|
// Iterate through the sorted file size array.
|
|
|
|
for (let j = 0; j < fileSizeObjects.length; j++) {
|
|
|
|
|
|
|
|
// if need to rollover to new warc
|
|
|
|
let doRollover = false;
|
|
|
|
|
|
|
|
// set to true for first warc
|
|
|
|
if (combinedWarcNumber < 0) {
|
|
|
|
doRollover = true;
|
|
|
|
} else {
|
|
|
|
// Check the size of the existing combined warc.
|
2022-02-08 15:31:55 -08:00
|
|
|
const currentCombinedWarcSize = await getFileSize(combinedWarcFullPath);
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
// If adding the current warc to the existing combined file creates a file smaller than the rollover size add the data to the combinedWarc
|
|
|
|
const proposedWarcSize = fileSizeObjects[j].fileSize + currentCombinedWarcSize;
|
|
|
|
|
|
|
|
doRollover = (proposedWarcSize >= this.params.rolloverSize);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (doRollover) {
|
2021-06-23 19:36:32 -07:00
|
|
|
// If adding the current warc to the existing combined file creates a file larger than the rollover size do the following:
|
2021-03-31 13:41:27 -04:00
|
|
|
// 1. increment the combinedWarcNumber
|
|
|
|
// 2. create the name of the new combinedWarcFile
|
|
|
|
// 3. Write the header out to the new file
|
|
|
|
// 4. Write out the current warc data to the combinedFile
|
|
|
|
combinedWarcNumber = combinedWarcNumber + 1;
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
const combinedWarcName = `${this.params.collection}_${combinedWarcNumber}.warc.gz`;
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
// write combined warcs to root collection dir as they're output of a collection (like wacz)
|
|
|
|
combinedWarcFullPath = path.join(this.collDir, combinedWarcName);
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
if (fh) {
|
|
|
|
fh.end();
|
|
|
|
}
|
|
|
|
|
|
|
|
fh = fs.createWriteStream(combinedWarcFullPath, {flags: "a"});
|
|
|
|
|
2021-03-31 13:41:27 -04:00
|
|
|
generatedCombinedWarcs.push(combinedWarcName);
|
|
|
|
|
|
|
|
const warcBuffer = await this.createWARCInfo(combinedWarcName);
|
2021-04-29 14:34:56 -07:00
|
|
|
fh.write(warcBuffer);
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.debugLog(`Appending WARC ${fileSizeObjects[j].fileName}`);
|
2021-04-29 14:34:56 -07:00
|
|
|
|
|
|
|
const reader = fs.createReadStream(fileSizeObjects[j].fileName);
|
|
|
|
|
|
|
|
const p = new Promise((resolve) => {
|
|
|
|
reader.on("end", () => resolve());
|
|
|
|
});
|
|
|
|
|
|
|
|
reader.pipe(fh, {end: false});
|
|
|
|
|
|
|
|
await p;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fh) {
|
|
|
|
await fh.end();
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
this.debugLog(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
async serializeConfig(done = false) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
switch (this.params.saveState) {
|
|
|
|
case "never":
|
|
|
|
return;
|
|
|
|
|
|
|
|
case "partial":
|
2022-03-14 10:41:56 -07:00
|
|
|
if (!done) {
|
|
|
|
return;
|
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (await this.crawlState.finished()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case "always":
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
const now = new Date();
|
|
|
|
|
|
|
|
if (!done) {
|
|
|
|
// if not done, save state only after specified interval has elapsed
|
|
|
|
if ((now.getTime() - this.lastSaveTime) < this.saveStateInterval) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
this.lastSaveTime = now.getTime();
|
|
|
|
|
|
|
|
const ts = now.toISOString().slice(0,19).replace(/[T:-]/g, "");
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
|
|
|
const crawlDir = path.join(this.collDir, "crawls");
|
|
|
|
|
|
|
|
await fsp.mkdir(crawlDir, {recursive: true});
|
|
|
|
|
|
|
|
const filename = path.join(crawlDir, `crawl-${ts}-${this.params.crawlId}.yaml`);
|
|
|
|
|
|
|
|
const state = await this.crawlState.serialize();
|
|
|
|
|
|
|
|
if (this.origConfig) {
|
|
|
|
this.origConfig.state = state;
|
|
|
|
}
|
|
|
|
const res = yaml.dump(this.origConfig, {lineWidth: -1});
|
2022-03-14 10:41:56 -07:00
|
|
|
try {
|
|
|
|
this.statusLog("Saving crawl state to: " + filename);
|
|
|
|
await fsp.writeFile(filename, res);
|
|
|
|
} catch (e) {
|
|
|
|
console.error(`Failed to write save state file: ${filename}`, e);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
this.saveStateFiles.push(filename);
|
|
|
|
|
|
|
|
if (this.saveStateFiles.length > this.params.saveStateHistory) {
|
|
|
|
const oldFilename = this.saveStateFiles.shift();
|
|
|
|
this.statusLog(`Removing old save-state: ${oldFilename}`);
|
|
|
|
try {
|
|
|
|
await fsp.unlink(oldFilename);
|
|
|
|
} catch (e) {
|
|
|
|
console.error(`Failed to delete old save state file: ${oldFilename}`);
|
|
|
|
}
|
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
// Expose Crawler as a named property of this module's CommonJS exports.
module.exports.Crawler = Crawler;
|