2022-10-24 15:30:10 +02:00
|
|
|
import child_process from "child_process";
|
|
|
|
import path from "path";
|
|
|
|
import fs from "fs";
|
|
|
|
import os from "os";
|
|
|
|
import fsp from "fs/promises";
|
2021-02-08 22:21:34 -08:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
import { RedisCrawlState, LoadState } from "./util/state.js";
|
2022-10-24 15:30:10 +02:00
|
|
|
import Sitemapper from "sitemapper";
|
|
|
|
import { v4 as uuidv4 } from "uuid";
|
|
|
|
import yaml from "js-yaml";
|
|
|
|
|
|
|
|
import * as warcio from "warcio";
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
import { HealthChecker } from "./util/healthcheck.js";
|
2022-10-24 15:30:10 +02:00
|
|
|
import { TextExtract } from "./util/textextract.js";
|
2023-07-06 00:58:28 -04:00
|
|
|
import { initStorage, getFileSize, getDirSize, interpolateFilename, checkDiskUtilization } from "./util/storage.js";
|
2022-10-24 15:30:10 +02:00
|
|
|
import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screencaster.js";
|
2022-12-21 12:06:13 -05:00
|
|
|
import { Screenshots } from "./util/screenshots.js";
|
2022-10-24 15:30:10 +02:00
|
|
|
import { parseArgs } from "./util/argParser.js";
|
|
|
|
import { initRedis } from "./util/redis.js";
|
2023-03-17 14:24:44 -07:00
|
|
|
import { logger, errJSON } from "./util/logger.js";
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
import { runWorkers } from "./util/worker.js";
|
2023-03-22 14:50:18 -04:00
|
|
|
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
|
2023-07-06 16:09:48 -04:00
|
|
|
import { collectAllFileSources } from "./util/file_reader.js";
|
2020-11-01 19:22:53 -08:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
import { Browser } from "./util/browser.js";
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
import { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
2022-10-25 10:53:32 -04:00
|
|
|
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
|
2023-04-19 19:17:15 -07:00
|
|
|
import { OriginOverride } from "./util/originoverride.js";
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
// to ignore HTTPS error for HEAD check
|
|
|
|
import { Agent as HTTPAgent } from "http";
|
|
|
|
import { Agent as HTTPSAgent } from "https";
|
|
|
|
|
|
|
|
const HTTPS_AGENT = HTTPSAgent({
|
|
|
|
rejectUnauthorized: false,
|
|
|
|
});
|
2021-06-07 17:43:36 -07:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
const HTTP_AGENT = HTTPAgent();
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2023-03-10 23:11:24 -05:00
|
|
|
const FETCH_TIMEOUT_SECS = 30;
|
|
|
|
const PAGE_OP_TIMEOUT_SECS = 5;
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-05-03 16:25:59 -07:00
|
|
|
const POST_CRAWL_STATES = ["generate-wacz", "uploading-wacz", "generate-cdx", "generate-warc"];
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
// ============================================================================
|
2022-10-24 15:30:10 +02:00
|
|
|
export class Crawler {
|
2020-11-01 19:22:53 -08:00
|
|
|
constructor() {
|
2022-12-15 12:38:41 -05:00
|
|
|
const res = parseArgs();
|
|
|
|
this.params = res.parsed;
|
|
|
|
this.origConfig = res.origConfig;
|
|
|
|
|
2023-02-24 18:31:08 -08:00
|
|
|
// root collections dir
|
|
|
|
this.collDir = path.join(this.params.cwd, "collections", this.params.collection);
|
|
|
|
this.logDir = path.join(this.collDir, "logs");
|
|
|
|
this.logFilename = path.join(this.logDir, `crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`);
|
|
|
|
|
2022-12-15 12:38:41 -05:00
|
|
|
const debugLogging = this.params.logging.includes("debug");
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.setDebugLogging(debugLogging);
|
2023-04-01 13:07:59 -04:00
|
|
|
logger.setLogLevel(this.params.logLevel);
|
|
|
|
logger.setContext(this.params.context);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Writing log to: " + this.logFilename, {}, "init");
|
2022-12-15 12:38:41 -05:00
|
|
|
|
2020-11-03 17:16:29 +00:00
|
|
|
this.headers = {};
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
this.crawlState = null;
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
// pages file
|
|
|
|
this.pagesFH = null;
|
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
this.crawlId = process.env.CRAWL_ID || os.hostname();
|
|
|
|
|
|
|
|
this.startTime = Date.now();
|
|
|
|
|
2021-01-29 18:26:55 +00:00
|
|
|
// was the limit hit?
|
|
|
|
this.limitHit = false;
|
2023-04-03 11:10:47 -07:00
|
|
|
this.pageLimit = this.params.pageLimit;
|
|
|
|
|
|
|
|
// resolve maxPageLimit and ensure pageLimit is no greater than maxPageLimit
|
|
|
|
if (this.params.maxPageLimit) {
|
|
|
|
this.pageLimit = this.pageLimit ? Math.min(this.pageLimit, this.params.maxPageLimit) : this.params.maxPageLimit;
|
|
|
|
}
|
2021-01-29 18:26:55 +00:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
this.saveStateFiles = [];
|
|
|
|
this.lastSaveTime = 0;
|
2023-03-22 14:50:18 -04:00
|
|
|
|
|
|
|
// sum of page load + behavior timeouts + 2 x fetch + cloudflare + link extraction timeouts + extra page delay
|
|
|
|
// if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
|
|
|
|
this.maxPageTime = this.params.pageLoadTimeout + this.params.behaviorTimeout +
|
|
|
|
FETCH_TIMEOUT_SECS*2 + PAGE_OP_TIMEOUT_SECS*2 + this.params.pageExtraDelay;
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
this.emulateDevice = this.params.emulateDevice || {};
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2021-07-19 15:49:43 -07:00
|
|
|
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
|
2022-08-11 18:44:39 -07:00
|
|
|
this.capturePrefix = process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/";
|
2021-02-04 00:28:32 -05:00
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
this.gotoOpts = {
|
|
|
|
waitUntil: this.params.waitUntil,
|
2023-03-22 14:50:18 -04:00
|
|
|
timeout: this.params.pageLoadTimeout * 1000
|
2021-05-21 15:37:02 -07:00
|
|
|
};
|
2021-02-04 00:28:32 -05:00
|
|
|
|
|
|
|
// pages directory
|
|
|
|
this.pagesDir = path.join(this.collDir, "pages");
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2021-02-04 00:28:32 -05:00
|
|
|
// pages file
|
|
|
|
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
|
2021-07-19 15:49:43 -07:00
|
|
|
|
|
|
|
this.blockRules = null;
|
2022-10-25 10:53:32 -04:00
|
|
|
this.adBlockRules = null;
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
this.healthChecker = null;
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
2022-09-20 17:09:52 -07:00
|
|
|
this.interrupted = false;
|
|
|
|
this.finalExit = false;
|
|
|
|
this.clearOnExit = false;
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
|
|
|
|
this.done = false;
|
2022-09-20 17:09:52 -07:00
|
|
|
|
2023-07-06 16:09:48 -04:00
|
|
|
this.customBehaviors = "";
|
2022-07-08 17:17:46 -07:00
|
|
|
this.behaviorLastLine = null;
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
|
|
|
|
this.browser = new Browser();
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
}
|
|
|
|
|
2020-11-14 19:32:31 +00:00
|
|
|
configureUA() {
|
|
|
|
// override userAgent
|
|
|
|
if (this.params.userAgent) {
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
this.emulateDevice.userAgent = this.params.userAgent;
|
2020-11-14 19:32:31 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// if device set, it overrides the default Chrome UA
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (!this.emulateDevice.userAgent) {
|
|
|
|
this.emulateDevice.userAgent = this.browser.getDefaultUA();
|
2020-11-14 19:32:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// suffix to append to default userAgent
|
|
|
|
if (this.params.userAgentSuffix) {
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
this.emulateDevice.userAgent += " " + this.params.userAgentSuffix;
|
2020-11-14 19:32:31 +00:00
|
|
|
}
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
|
|
|
|
return this.emulateDevice.userAgent;
|
2020-11-14 19:32:31 +00:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
async initCrawlState() {
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (!redisUrl.startsWith("redis://")) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.fatal("stateStoreUrl must start with redis:// -- Only redis-based store currently supported");
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
let redis;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
while (true) {
|
|
|
|
try {
|
|
|
|
redis = await initRedis(redisUrl);
|
|
|
|
break;
|
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
//logger.fatal("Unable to connect to state store Redis: " + redisUrl);
|
|
|
|
logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
await sleep(3);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
}
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-03-22 14:50:18 -04:00
|
|
|
logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");
|
|
|
|
|
|
|
|
this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.maxPageTime, os.hostname());
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-03-30 21:29:41 -07:00
|
|
|
// clear any pending URLs from this instance
|
|
|
|
await this.crawlState.clearOwnPendingLocks();
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
if (this.params.saveState === "always" && this.params.saveStateInterval) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");
|
2022-03-14 10:41:56 -07:00
|
|
|
}
|
|
|
|
|
2023-04-11 11:32:52 -04:00
|
|
|
if (this.params.logErrorsToRedis) {
|
|
|
|
logger.setLogErrorsToRedis(true);
|
|
|
|
logger.setCrawlState(this.crawlState);
|
|
|
|
}
|
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
return this.crawlState;
|
|
|
|
}
|
|
|
|
|
2022-02-23 12:09:48 -08:00
|
|
|
initScreenCaster() {
|
|
|
|
let transport;
|
|
|
|
|
|
|
|
if (this.params.screencastPort) {
|
|
|
|
transport = new WSTransport(this.params.screencastPort);
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Screencast server started on: ${this.params.screencastPort}`, {}, "screencast");
|
2022-02-23 12:09:48 -08:00
|
|
|
} else if (this.params.redisStoreUrl && this.params.screencastRedis) {
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId);
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Screencast enabled via redis pubsub", {}, "screencast");
|
2022-02-23 12:09:48 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!transport) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
2022-03-02 13:26:11 -08:00
|
|
|
return new ScreenCaster(transport, this.params.workers);
|
2022-02-23 12:09:48 -08:00
|
|
|
}
|
|
|
|
|
2022-08-11 18:44:39 -07:00
|
|
|
async bootstrap() {
|
2023-03-08 21:31:19 -05:00
|
|
|
const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd});
|
|
|
|
|
|
|
|
if (initRes.status) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("wb-manager init failed, collection likely already exists");
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
fs.mkdirSync(this.logDir, {recursive: true});
|
|
|
|
this.logFH = fs.createWriteStream(this.logFilename);
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.setExternalLogStream(this.logFH);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
|
|
|
this.infoString = await this.getInfoString();
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(this.infoString);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Seeds", this.params.scopedSeeds);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
|
|
|
if (this.params.profile) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("With Browser Profile", {url: this.params.profile});
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
|
|
|
|
2023-02-03 00:02:47 -05:00
|
|
|
if (this.params.overwrite) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Clearing ${this.collDir} before starting`);
|
2023-02-03 00:02:47 -05:00
|
|
|
try {
|
|
|
|
fs.rmSync(this.collDir, { recursive: true, force: true });
|
|
|
|
} catch(e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error(`Unable to clear ${this.collDir}`, e);
|
2023-02-03 00:02:47 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-06 16:09:48 -04:00
|
|
|
if (this.params.customBehaviors) {
|
|
|
|
this.customBehaviors = this.loadCustomBehaviors(this.params.customBehaviors);
|
|
|
|
}
|
|
|
|
|
2021-03-31 13:41:27 -04:00
|
|
|
let opts = {};
|
2022-08-11 18:44:39 -07:00
|
|
|
let redisStdio;
|
|
|
|
|
2021-03-13 16:48:31 -08:00
|
|
|
if (this.params.logging.includes("pywb")) {
|
2023-02-24 18:31:08 -08:00
|
|
|
const pywbStderr = fs.openSync(path.join(this.logDir, "pywb.log"), "a");
|
2022-08-11 18:44:39 -07:00
|
|
|
const stdio = [process.stdin, pywbStderr, pywbStderr];
|
|
|
|
|
2023-02-24 18:31:08 -08:00
|
|
|
const redisStderr = fs.openSync(path.join(this.logDir, "redis.log"), "a");
|
2022-08-11 18:44:39 -07:00
|
|
|
redisStdio = [process.stdin, redisStderr, redisStderr];
|
|
|
|
|
|
|
|
opts = {stdio, cwd: this.params.cwd};
|
|
|
|
} else {
|
2021-03-04 15:36:58 -05:00
|
|
|
opts = {stdio: "ignore", cwd: this.params.cwd};
|
2022-08-11 18:44:39 -07:00
|
|
|
redisStdio = "ignore";
|
2021-03-04 15:36:58 -05:00
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
this.headers = {"User-Agent": this.configureUA()};
|
2020-11-03 17:16:29 +00:00
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
const subprocesses = [];
|
|
|
|
|
2022-08-11 18:44:39 -07:00
|
|
|
subprocesses.push(child_process.spawn("redis-server", {cwd: "/tmp/", stdio: redisStdio}));
|
2021-05-21 15:37:02 -07:00
|
|
|
|
2023-07-06 18:54:35 +00:00
|
|
|
opts.env = {
|
|
|
|
...process.env,
|
|
|
|
COLL: this.params.collection,
|
|
|
|
ROLLOVER_SIZE: this.params.rolloverSize,
|
|
|
|
DEDUP_POLICY: this.params.dedupPolicy
|
|
|
|
};
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
subprocesses.push(child_process.spawn("uwsgi", [new URL("uwsgi.ini", import.meta.url).pathname], opts));
|
2021-05-21 15:37:02 -07:00
|
|
|
|
|
|
|
process.on("exit", () => {
|
|
|
|
for (const proc of subprocesses) {
|
|
|
|
proc.kill();
|
|
|
|
}
|
|
|
|
});
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2023-05-22 16:24:39 -07:00
|
|
|
child_process.spawn("socat", ["tcp-listen:9222,reuseaddr,fork", "tcp:localhost:9221"]);
|
2022-08-21 00:30:25 -07:00
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (!this.params.headless && !process.env.NO_XVFB) {
|
2020-11-01 19:22:53 -08:00
|
|
|
child_process.spawn("Xvfb", [
|
|
|
|
process.env.DISPLAY,
|
|
|
|
"-listen",
|
|
|
|
"tcp",
|
|
|
|
"-screen",
|
|
|
|
"0",
|
|
|
|
process.env.GEOMETRY,
|
|
|
|
"-ac",
|
|
|
|
"+extension",
|
|
|
|
"RANDR"
|
|
|
|
]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-11-21 11:59:37 -08:00
|
|
|
extraChromeArgs() {
|
|
|
|
const args = [];
|
|
|
|
if (this.params.lang) {
|
|
|
|
args.push(`--accept-lang=${this.params.lang}`);
|
|
|
|
}
|
|
|
|
return args;
|
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async run() {
|
2022-08-11 18:44:39 -07:00
|
|
|
await this.bootstrap();
|
2021-08-17 20:54:18 -07:00
|
|
|
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
let status;
|
2022-09-20 17:09:52 -07:00
|
|
|
let exitCode = 0;
|
2020-10-31 13:16:37 -07:00
|
|
|
|
|
|
|
try {
|
2020-11-01 19:22:53 -08:00
|
|
|
await this.crawl();
|
2022-09-20 17:09:52 -07:00
|
|
|
status = (!this.interrupted ? "done" : "interrupted");
|
2020-10-31 13:16:37 -07:00
|
|
|
} catch(e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error("Crawl failed", e);
|
2022-09-20 17:09:52 -07:00
|
|
|
exitCode = 9;
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
status = "failing";
|
|
|
|
if (await this.crawlState.incFailCount()) {
|
|
|
|
status = "failed";
|
|
|
|
}
|
|
|
|
|
|
|
|
} finally {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Crawl status: ${status}`);
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
|
2022-06-30 19:24:26 -07:00
|
|
|
if (this.crawlState) {
|
|
|
|
await this.crawlState.setStatus(status);
|
|
|
|
}
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
|
2022-09-20 17:09:52 -07:00
|
|
|
process.exit(exitCode);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2021-04-10 13:08:22 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
_behaviorLog({data, type}, pageUrl, workerid) {
|
2022-07-08 17:17:46 -07:00
|
|
|
let behaviorLine;
|
2023-02-23 18:50:22 -08:00
|
|
|
let message;
|
|
|
|
let details;
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const logDetails = {page: pageUrl, workerid};
|
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
if (typeof(data) === "string") {
|
|
|
|
message = data;
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
details = logDetails;
|
2023-02-23 18:50:22 -08:00
|
|
|
} else {
|
|
|
|
message = type === "info" ? "Behavior log" : "Behavior debug";
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
details = typeof(data) === "object" ? {...data, ...logDetails} : logDetails;
|
2023-02-23 18:50:22 -08:00
|
|
|
}
|
2022-07-08 17:17:46 -07:00
|
|
|
|
2021-04-10 13:08:22 -07:00
|
|
|
switch (type) {
|
|
|
|
case "info":
|
2022-07-08 17:17:46 -07:00
|
|
|
behaviorLine = JSON.stringify(data);
|
|
|
|
if (behaviorLine != this._behaviorLastLine) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(message, details, "behaviorScript");
|
2022-07-08 17:17:46 -07:00
|
|
|
this._behaviorLastLine = behaviorLine;
|
|
|
|
}
|
2021-04-10 13:08:22 -07:00
|
|
|
break;
|
|
|
|
|
2023-07-06 16:09:48 -04:00
|
|
|
case "error":
|
|
|
|
logger.error(message, details, "behaviorScript");
|
|
|
|
break;
|
|
|
|
|
2021-04-10 13:08:22 -07:00
|
|
|
case "debug":
|
|
|
|
default:
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(message, details, "behaviorScript");
|
2021-04-10 13:08:22 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
isInScope({seedId, url, depth, extraHops} = {}, logDetails = {}) {
|
2022-09-20 17:09:52 -07:00
|
|
|
const seed = this.params.scopedSeeds[seedId];
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
return seed.isIncluded(url, depth, extraHops, logDetails);
|
2022-09-20 17:09:52 -07:00
|
|
|
}
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
async setupPage({page, cdp, workerid}) {
|
2023-04-24 10:26:56 -07:00
|
|
|
await this.browser.setupPage({page, cdp});
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
|
2023-04-26 15:41:35 -07:00
|
|
|
if ((this.adBlockRules && this.params.blockAds) ||
|
|
|
|
this.blockRules || this.originOverride) {
|
|
|
|
|
|
|
|
await page.setRequestInterception(true);
|
|
|
|
|
|
|
|
if (this.adBlockRules && this.params.blockAds) {
|
|
|
|
await this.adBlockRules.initPage(this.browser, page);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (this.blockRules) {
|
|
|
|
await this.blockRules.initPage(this.browser, page);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (this.originOverride) {
|
|
|
|
await this.originOverride.initPage(this.browser, page);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (this.params.logging.includes("jserrors")) {
|
|
|
|
page.on("console", (msg) => {
|
|
|
|
if (msg.type() === "error") {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn(msg.text(), {"location": msg.location(), "page": page.url(), workerid}, "jsError");
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
page.on("pageerror", (e) => {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn("Page Error", {...errJSON(e), "page": page.url(), workerid}, "jsError");
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
if (this.screencaster) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Start Screencast", {workerid}, "screencast");
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
await this.screencaster.screencastPage(page, cdp, workerid);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (this.params.behaviorOpts) {
|
|
|
|
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, page.url(), workerid));
|
2023-07-06 16:09:48 -04:00
|
|
|
await this.browser.addInitScript(page, behaviors);
|
|
|
|
|
|
|
|
const initScript = `
|
|
|
|
self.__bx_behaviors.init(${this.params.behaviorOpts}, false);
|
|
|
|
${this.customBehaviors}
|
|
|
|
self.__bx_behaviors.selectMainBehavior();
|
|
|
|
`;
|
|
|
|
|
|
|
|
await this.browser.addInitScript(page, initScript);
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-06 16:09:48 -04:00
|
|
|
loadCustomBehaviors(filename) {
|
|
|
|
let str = "";
|
|
|
|
|
|
|
|
for (const source of collectAllFileSources(filename, ".js")) {
|
|
|
|
str += `self.__bx_behaviors.load(${source});\n`;
|
|
|
|
}
|
|
|
|
|
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
2022-09-20 17:09:52 -07:00
|
|
|
async crawlPage(opts) {
|
2023-02-23 18:50:22 -08:00
|
|
|
await this.writeStats();
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const {page, cdp, data, workerid} = opts;
|
2023-02-23 18:50:22 -08:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const {url} = data;
|
2023-03-13 18:07:59 -04:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const logDetails = {page: url, workerid};
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
data.logDetails = logDetails;
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
data.workerid = workerid;
|
2022-09-20 17:09:52 -07:00
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
if (!this.isInScope(data, logDetails)) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Page no longer in scope", data);
|
2023-03-13 14:48:04 -07:00
|
|
|
return true;
|
2022-09-20 17:09:52 -07:00
|
|
|
}
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
// run custom driver here
|
|
|
|
await this.driver({page, data, crawler: this});
|
2021-06-07 17:43:36 -07:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
data.title = await page.title();
|
2021-02-08 22:21:34 -08:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (this.params.screenshot) {
|
|
|
|
if (!data.isHTMLPage) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Skipping screenshots for non-HTML page", logDetails);
|
2021-07-20 15:45:51 -07:00
|
|
|
}
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const archiveDir = path.join(this.collDir, "archive");
|
2023-04-26 15:41:35 -07:00
|
|
|
const screenshots = new Screenshots({browser: this.browser, page, url, directory: archiveDir});
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (this.params.screenshot.includes("view")) {
|
|
|
|
await screenshots.take();
|
2021-03-13 16:48:31 -08:00
|
|
|
}
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (this.params.screenshot.includes("fullPage")) {
|
|
|
|
await screenshots.takeFullPage();
|
|
|
|
}
|
|
|
|
if (this.params.screenshot.includes("thumbnail")) {
|
|
|
|
await screenshots.takeThumbnail();
|
|
|
|
}
|
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (this.params.text && data.isHTMLPage) {
|
|
|
|
const result = await cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
data.text = await new TextExtract(result).parseTextFromDom();
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
2022-12-21 12:06:13 -05:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
data.loadState = LoadState.EXTRACTION_DONE;
|
2022-12-21 12:06:13 -05:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (this.params.behaviorOpts) {
|
|
|
|
if (!data.isHTMLPage) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Skipping behaviors for non-HTML page", logDetails, "behavior");
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
} else if (data.skipBehaviors) {
|
|
|
|
logger.info("Skipping behaviors for slow page", logDetails, "behavior");
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
} else {
|
|
|
|
const res = await timedRun(
|
2023-04-26 15:41:35 -07:00
|
|
|
this.runBehaviors(page, cdp, data.filteredFrames, logDetails),
|
2023-03-22 14:50:18 -04:00
|
|
|
this.params.behaviorTimeout,
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
"Behaviors timed out",
|
|
|
|
logDetails,
|
|
|
|
"behavior"
|
|
|
|
);
|
2021-02-08 22:21:34 -08:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (res && res.length) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior");
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
data.loadState = LoadState.BEHAVIORS_DONE;
|
2022-03-22 17:41:51 -07:00
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
}
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2023-03-22 14:50:18 -04:00
|
|
|
if (this.params.pageExtraDelay) {
|
|
|
|
logger.info(`Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`, logDetails);
|
|
|
|
await sleep(this.params.pageExtraDelay);
|
|
|
|
}
|
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
return true;
|
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
async pageFinished(data) {
|
|
|
|
await this.writePage(data);
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
// if page loaded, considered page finished successfully
|
|
|
|
// (even if behaviors timed out)
|
|
|
|
const { loadState, logDetails } = data;
|
|
|
|
|
|
|
|
if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
|
|
|
|
logger.info("Page Finished", {loadState, ...logDetails}, "pageStatus");
|
|
|
|
|
|
|
|
await this.crawlState.markFinished(data.url);
|
|
|
|
|
|
|
|
if (this.healthChecker) {
|
|
|
|
this.healthChecker.resetErrors();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
logger.warn("Page Load Failed", {loadState, ...logDetails}, "pageStatus");
|
|
|
|
|
|
|
|
await this.crawlState.markFailed(data.url);
|
|
|
|
|
|
|
|
if (this.healthChecker) {
|
|
|
|
this.healthChecker.incError();
|
|
|
|
}
|
|
|
|
}
|
2022-03-14 10:41:56 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
await this.serializeConfig();
|
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
await this.checkLimits();
|
|
|
|
}
|
|
|
|
|
|
|
|
async teardownPage({workerid}) {
|
|
|
|
if (this.screencaster) {
|
|
|
|
await this.screencaster.stopById(workerid);
|
2021-02-08 22:21:34 -08:00
|
|
|
}
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
}
|
2023-03-13 14:48:04 -07:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
async workerIdle(workerid) {
|
|
|
|
if (this.screencaster) {
|
|
|
|
//logger.debug("End Screencast", {workerid}, "screencast");
|
|
|
|
await this.screencaster.stopById(workerid, true);
|
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2023-04-26 15:41:35 -07:00
|
|
|
async runBehaviors(page, cdp, frames, logDetails) {
|
2023-03-08 21:31:19 -05:00
|
|
|
try {
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
frames = frames || page.frames();
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Running behaviors", {frames: frames.length, frameUrls: frames.map(frame => frame.url()), ...logDetails}, "behavior");
|
2023-03-08 21:31:19 -05:00
|
|
|
|
|
|
|
return await Promise.allSettled(
|
2023-04-26 15:41:35 -07:00
|
|
|
frames.map(frame => this.browser.evaluateWithCLI(page, frame, cdp, "self.__bx_behaviors.run();", logDetails, "behavior"))
|
2023-03-08 21:31:19 -05:00
|
|
|
);
|
|
|
|
|
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn("Behavior run failed", {...errJSON(e), ...logDetails}, "behavior");
|
2023-03-08 21:31:19 -05:00
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
async shouldIncludeFrame(frame, logDetails) {
|
2023-01-23 16:47:33 -08:00
|
|
|
if (!frame.parentFrame()) {
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
return frame;
|
2023-01-23 16:47:33 -08:00
|
|
|
}
|
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
const frameUrl = frame.url();
|
2023-01-23 16:47:33 -08:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
const frameElem = await frame.frameElement();
|
|
|
|
|
|
|
|
const tagName = await frame.evaluate(e => e.tagName, frameElem);
|
|
|
|
|
|
|
|
if (tagName !== "IFRAME" && tagName !== "FRAME") {
|
|
|
|
logger.debug("Skipping processing non-frame object", {tagName, frameUrl, ...logDetails}, "behavior");
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
let res;
|
|
|
|
|
|
|
|
if (frameUrl === "about:blank") {
|
|
|
|
res = false;
|
|
|
|
} else {
|
2023-03-08 21:31:19 -05:00
|
|
|
res = !this.adBlockRules.isAdUrl(frameUrl);
|
2023-02-23 18:50:22 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!res) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Skipping processing frame", {frameUrl, ...logDetails}, "behavior");
|
2023-01-23 16:47:33 -08:00
|
|
|
}
|
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
return res ? frame : null;
|
2023-01-23 16:47:33 -08:00
|
|
|
}
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
async getInfoString() {
|
2021-04-29 14:34:56 -07:00
|
|
|
const packageFileJSON = JSON.parse(await fsp.readFile("../app/package.json"));
|
2021-07-07 18:56:52 -04:00
|
|
|
const warcioPackageJSON = JSON.parse(await fsp.readFile("/app/node_modules/warcio/package.json"));
|
2021-06-24 15:39:17 -07:00
|
|
|
const pywbVersion = child_process.execSync("pywb -V", {encoding: "utf8"}).trim().split(" ")[1];
|
2021-03-31 13:41:27 -04:00
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version} pywb ${pywbVersion})`;
|
|
|
|
}
|
|
|
|
|
|
|
|
async createWARCInfo(filename) {
|
|
|
|
const warcVersion = "WARC/1.0";
|
|
|
|
const type = "warcinfo";
|
|
|
|
|
2021-03-31 13:41:27 -04:00
|
|
|
const info = {
|
2023-03-08 21:31:19 -05:00
|
|
|
"software": this.infoString,
|
2021-07-07 18:56:52 -04:00
|
|
|
"format": "WARC File Format 1.0"
|
2021-03-31 13:41:27 -04:00
|
|
|
};
|
2022-12-21 12:06:13 -05:00
|
|
|
|
2021-07-07 18:56:52 -04:00
|
|
|
const warcInfo = {...info, ...this.params.warcInfo, };
|
|
|
|
const record = await warcio.WARCRecord.createWARCInfo({filename, type, warcVersion}, warcInfo);
|
2021-03-31 13:41:27 -04:00
|
|
|
const buffer = await warcio.WARCSerializer.serialize(record, {gzip: true});
|
|
|
|
return buffer;
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
async checkLimits() {
|
|
|
|
let interrupt = false;
|
|
|
|
|
2023-04-19 21:10:02 -04:00
|
|
|
const dir = path.join(this.collDir, "archive");
|
|
|
|
const size = await getDirSize(dir);
|
|
|
|
|
|
|
|
await this.crawlState.setArchiveSize(size);
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
2023-03-31 12:35:18 -04:00
|
|
|
if (this.params.sizeLimit) {
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
if (size >= this.params.sizeLimit) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
interrupt = true;
|
2022-09-20 17:09:52 -07:00
|
|
|
this.clearOnExit = true;
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (this.params.timeLimit) {
|
2023-03-22 14:50:18 -04:00
|
|
|
const elapsed = secondsElapsed(this.startTime);
|
|
|
|
if (elapsed >= this.params.timeLimit) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
interrupt = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-31 12:35:18 -04:00
|
|
|
if (this.params.diskUtilization) {
|
2023-07-06 00:58:28 -04:00
|
|
|
// Check that disk usage isn't already or soon to be above threshold
|
|
|
|
const diskUtil = await checkDiskUtilization(this.params, size);
|
|
|
|
if (diskUtil.stop === true) {
|
2023-03-31 12:35:18 -04:00
|
|
|
interrupt = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
if (interrupt) {
|
2022-09-20 17:09:52 -07:00
|
|
|
this.gracefulFinish();
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-09-20 17:09:52 -07:00
|
|
|
gracefulFinish() {
|
|
|
|
this.interrupted = true;
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Crawler interrupted, gracefully finishing current pages");
|
2022-09-20 17:09:52 -07:00
|
|
|
if (!this.params.waitOnDone) {
|
|
|
|
this.finalExit = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
prepareForExit(markDone = true) {
|
|
|
|
if (!markDone) {
|
|
|
|
this.params.waitOnDone = false;
|
|
|
|
this.clearOnExit = true;
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("SIGNAL: Preparing for exit of this crawler instance only");
|
2022-09-20 17:09:52 -07:00
|
|
|
} else {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("SIGNAL: Preparing for final exit of all crawlers");
|
2022-09-20 17:09:52 -07:00
|
|
|
this.finalExit = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
async serializeAndExit() {
|
|
|
|
await this.serializeConfig();
|
|
|
|
process.exit(0);
|
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async crawl() {
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
if (this.params.healthCheckPort) {
|
2023-03-08 21:31:19 -05:00
|
|
|
this.healthChecker = new HealthChecker(this.params.healthCheckPort, this.params.workers);
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
}
|
|
|
|
|
2020-10-31 13:16:37 -07:00
|
|
|
try {
|
2022-10-24 15:30:10 +02:00
|
|
|
const driverUrl = new URL(this.params.driver, import.meta.url);
|
|
|
|
this.driver = (await import(driverUrl)).default;
|
2020-11-01 19:22:53 -08:00
|
|
|
} catch(e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn(`Error importing driver ${this.params.driver}`, e);
|
2020-11-01 19:22:53 -08:00
|
|
|
return;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2022-06-30 19:24:26 -07:00
|
|
|
await this.initCrawlState();
|
|
|
|
|
|
|
|
let initState = await this.crawlState.getStatus();
|
|
|
|
|
|
|
|
while (initState === "debug") {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Paused for debugging, will continue after manual resume");
|
2022-06-30 19:24:26 -07:00
|
|
|
|
2023-03-13 14:48:04 -07:00
|
|
|
await sleep(60);
|
2022-06-30 19:24:26 -07:00
|
|
|
|
|
|
|
initState = await this.crawlState.getStatus();
|
|
|
|
}
|
|
|
|
|
2022-09-08 23:39:26 -07:00
|
|
|
// if already done, don't crawl anymore
|
|
|
|
if (initState === "done") {
|
|
|
|
this.done = true;
|
|
|
|
|
|
|
|
if (this.params.waitOnDone) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Already done, waiting for signal to exit...");
|
2022-09-08 23:39:26 -07:00
|
|
|
|
|
|
|
// wait forever until signal
|
|
|
|
await new Promise(() => {});
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2022-05-05 14:27:17 -05:00
|
|
|
if (this.params.generateWACZ) {
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
this.storage = initStorage();
|
2021-11-23 12:53:30 -08:00
|
|
|
}
|
|
|
|
|
2023-05-03 16:25:59 -07:00
|
|
|
if (POST_CRAWL_STATES.includes(initState)) {
|
|
|
|
logger.info("crawl already finished, running post-crawl tasks", {state: initState});
|
|
|
|
await this.postCrawl();
|
|
|
|
return;
|
|
|
|
} else if (await this.crawlState.isCrawlStopped()) {
|
|
|
|
logger.info("crawl stopped, running post-crawl tasks");
|
2023-03-08 21:31:19 -05:00
|
|
|
await this.postCrawl();
|
|
|
|
return;
|
|
|
|
}
|
2022-06-30 19:24:26 -07:00
|
|
|
|
|
|
|
await this.crawlState.setStatus("running");
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
|
|
|
if (this.params.state) {
|
|
|
|
await this.crawlState.load(this.params.state, this.params.scopedSeeds, true);
|
|
|
|
}
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
await this.initPages();
|
2021-06-07 17:43:36 -07:00
|
|
|
|
2023-04-03 10:58:13 -07:00
|
|
|
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage);
|
2022-10-25 10:53:32 -04:00
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
if (this.params.blockRules && this.params.blockRules.length) {
|
2023-04-03 10:58:13 -07:00
|
|
|
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage);
|
2021-07-19 15:49:43 -07:00
|
|
|
}
|
|
|
|
|
2022-02-23 12:09:48 -08:00
|
|
|
this.screencaster = this.initScreenCaster();
|
2021-06-07 17:43:36 -07:00
|
|
|
|
2023-04-26 15:41:35 -07:00
|
|
|
if (this.params.originOverride.length) {
|
2023-04-19 19:17:15 -07:00
|
|
|
this.originOverride = new OriginOverride(this.params.originOverride);
|
|
|
|
}
|
|
|
|
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
for (let i = 0; i < this.params.scopedSeeds.length; i++) {
|
|
|
|
const seed = this.params.scopedSeeds[i];
|
2022-01-15 09:03:09 -08:00
|
|
|
if (!await this.queueUrl(i, seed.url, 0, 0)) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (this.limitHit) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
if (seed.sitemap) {
|
|
|
|
await this.parseSitemap(seed.sitemap, i);
|
|
|
|
}
|
2020-11-14 21:55:02 +00:00
|
|
|
}
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
await this.browser.launch({
|
2023-04-24 10:26:56 -07:00
|
|
|
profileUrl: this.params.profile,
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
headless: this.params.headless,
|
|
|
|
emulateDevice: this.emulateDevice,
|
|
|
|
chromeOptions: {
|
|
|
|
proxy: !process.env.NO_PROXY,
|
|
|
|
userAgent: this.emulateDevice.userAgent,
|
|
|
|
extraArgs: this.extraChromeArgs()
|
|
|
|
}
|
2023-03-08 21:31:19 -05:00
|
|
|
});
|
|
|
|
|
2023-03-22 14:50:18 -04:00
|
|
|
// --------------
|
|
|
|
// Run Crawl Here!
|
|
|
|
await runWorkers(this, this.params.workers, this.maxPageTime);
|
|
|
|
// --------------
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
await this.serializeConfig(true);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
await this.browser.close();
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
if (this.pagesFH) {
|
2021-05-21 15:37:02 -07:00
|
|
|
await this.pagesFH.sync();
|
2021-04-29 14:34:56 -07:00
|
|
|
await this.pagesFH.close();
|
|
|
|
}
|
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
await this.writeStats(true);
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
// extra wait for all resources to land into WARCs
|
2021-04-30 12:31:14 -07:00
|
|
|
await this.awaitPendingClear();
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
await this.postCrawl();
|
|
|
|
}
|
|
|
|
|
|
|
|
async postCrawl() {
|
2021-03-31 13:41:27 -04:00
|
|
|
if (this.params.combineWARC) {
|
|
|
|
await this.combineWARC();
|
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2020-11-03 21:33:19 +00:00
|
|
|
if (this.params.generateCDX) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Generating CDX");
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.crawlState.setStatus("generate-cdx");
|
2023-05-07 16:00:56 -04:00
|
|
|
const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
|
|
|
|
if (indexResult === 0) {
|
|
|
|
logger.debug("Indexing complete, CDX successfully created");
|
|
|
|
} else {
|
|
|
|
logger.error("Error indexing and generating CDX", {"status code": indexResult});
|
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2023-02-24 18:31:08 -08:00
|
|
|
await this.closeLog();
|
|
|
|
|
2022-09-20 17:09:52 -07:00
|
|
|
if (this.params.generateWACZ && (!this.interrupted || this.finalExit || this.clearOnExit)) {
|
2022-02-08 15:31:55 -08:00
|
|
|
await this.generateWACZ();
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
|
2022-09-20 17:09:52 -07:00
|
|
|
if (this.clearOnExit) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Clearing ${this.collDir} before exit`);
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
try {
|
|
|
|
fs.rmSync(this.collDir, { recursive: true, force: true });
|
|
|
|
} catch(e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn(`Unable to clear ${this.collDir} before exit`, e);
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-09-20 17:09:52 -07:00
|
|
|
if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) {
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
this.done = true;
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("All done, waiting for signal...");
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
await this.crawlState.setStatus("done");
|
|
|
|
|
|
|
|
// wait forever until signal
|
|
|
|
await new Promise(() => {});
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
|
|
|
}
|
2022-01-26 16:06:10 -08:00
|
|
|
|
2023-02-24 18:31:08 -08:00
|
|
|
async closeLog() {
|
|
|
|
// close file-based log
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.setExternalLogStream(null);
|
2023-02-24 18:31:08 -08:00
|
|
|
try {
|
|
|
|
await new Promise(resolve => this.logFH.close(() => resolve()));
|
|
|
|
} catch (e) {
|
|
|
|
// ignore
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
async generateWACZ() {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Generating WACZ");
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.crawlState.setStatus("generate-wacz");
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
const archiveDir = path.join(this.collDir, "archive");
|
2021-11-23 12:53:30 -08:00
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
// Get a list of the warcs inside
|
|
|
|
const warcFileList = await fsp.readdir(archiveDir);
|
|
|
|
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
// is finished (>0 pages and all pages written)
|
|
|
|
const isFinished = await this.crawlState.isFinished();
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Num WARC Files: ${warcFileList.length}`);
|
2022-02-08 15:31:55 -08:00
|
|
|
if (!warcFileList.length) {
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
// if finished, just return
|
|
|
|
if (isFinished) {
|
|
|
|
return;
|
|
|
|
}
|
2023-05-19 07:38:16 -07:00
|
|
|
// if stopped, won't get anymore data, so consider failed
|
|
|
|
if (await this.crawlState.isCrawlStopped()) {
|
|
|
|
await this.crawlState.setStatus("failed");
|
|
|
|
}
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.fatal("No WARC Files, assuming crawl failed");
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Build the argument list to pass to the wacz create command
|
|
|
|
const waczFilename = this.params.collection.concat(".wacz");
|
|
|
|
const waczPath = path.join(this.collDir, waczFilename);
|
|
|
|
|
2023-02-24 18:31:08 -08:00
|
|
|
const createArgs = [
|
|
|
|
"create",
|
|
|
|
"--split-seeds",
|
|
|
|
"-o", waczPath,
|
|
|
|
"--pages", this.pagesFile,
|
|
|
|
"--log-directory", this.logDir
|
|
|
|
];
|
2022-02-08 15:31:55 -08:00
|
|
|
|
|
|
|
if (process.env.WACZ_SIGN_URL) {
|
|
|
|
createArgs.push("--signing-url");
|
|
|
|
createArgs.push(process.env.WACZ_SIGN_URL);
|
|
|
|
if (process.env.WACZ_SIGN_TOKEN) {
|
|
|
|
createArgs.push("--signing-token");
|
|
|
|
createArgs.push(process.env.WACZ_SIGN_TOKEN);
|
2021-11-23 12:53:30 -08:00
|
|
|
}
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
2022-02-08 15:31:55 -08:00
|
|
|
|
2023-04-04 10:46:03 -04:00
|
|
|
if (this.params.title) {
|
|
|
|
createArgs.push("--title");
|
|
|
|
createArgs.push(this.params.title);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (this.params.description) {
|
|
|
|
createArgs.push("--desc");
|
|
|
|
createArgs.push(this.params.description);
|
|
|
|
}
|
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
createArgs.push("-f");
|
|
|
|
|
|
|
|
warcFileList.forEach((val, index) => createArgs.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars
|
|
|
|
|
|
|
|
// create WACZ
|
2022-12-15 12:38:41 -05:00
|
|
|
const waczResult = await this.awaitProcess(child_process.spawn("wacz" , createArgs));
|
2022-02-08 15:31:55 -08:00
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
if (waczResult !== 0) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error("Error creating WACZ", {"status code": waczResult});
|
|
|
|
logger.fatal("Unable to write WACZ successfully");
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);
|
2022-02-08 15:31:55 -08:00
|
|
|
|
|
|
|
// Verify WACZ
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
/*
|
|
|
|
const validateArgs = ["validate"];
|
|
|
|
validateArgs.push("-f");
|
2022-02-08 15:31:55 -08:00
|
|
|
validateArgs.push(waczPath);
|
|
|
|
|
2022-12-15 12:38:41 -05:00
|
|
|
const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs));
|
2022-02-08 15:31:55 -08:00
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
if (waczVerifyResult !== 0) {
|
2022-02-08 15:31:55 -08:00
|
|
|
console.log("validate", waczVerifyResult);
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.fatal("Unable to verify WACZ created successfully");
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
*/
|
2022-02-08 15:31:55 -08:00
|
|
|
if (this.storage) {
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.crawlState.setStatus("uploading-wacz");
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
const filename = process.env.STORE_FILENAME || "@ts-@id.wacz";
|
|
|
|
const targetFilename = interpolateFilename(filename, this.crawlId);
|
|
|
|
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
await this.storage.uploadCollWACZ(waczPath, targetFilename, isFinished);
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
awaitProcess(proc) {
|
2023-05-07 16:00:56 -04:00
|
|
|
let stdout = [];
|
|
|
|
let stderr = [];
|
|
|
|
|
2022-12-15 12:38:41 -05:00
|
|
|
proc.stdout.on("data", (data) => {
|
2023-05-07 16:00:56 -04:00
|
|
|
stdout.push(data.toString());
|
2022-12-15 12:38:41 -05:00
|
|
|
});
|
|
|
|
|
|
|
|
proc.stderr.on("data", (data) => {
|
2023-05-07 16:00:56 -04:00
|
|
|
stderr.push(data.toString());
|
2022-12-15 12:38:41 -05:00
|
|
|
});
|
2023-05-07 16:00:56 -04:00
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
return new Promise((resolve) => {
|
2023-05-07 16:00:56 -04:00
|
|
|
proc.on("close", (code) => {
|
|
|
|
if (stdout.length) {
|
|
|
|
logger.debug(stdout.join("\n"));
|
|
|
|
}
|
|
|
|
if (stderr.length && this.params.logging.includes("debug")) {
|
|
|
|
logger.error(stderr.join("\n"));
|
|
|
|
}
|
|
|
|
resolve(code);
|
|
|
|
});
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2023-04-26 15:41:35 -07:00
|
|
|
logMemory() {
|
|
|
|
const memUsage = process.memoryUsage();
|
|
|
|
const { heapUsed, heapTotal } = memUsage;
|
|
|
|
this.maxHeapUsed = Math.max(this.maxHeapUsed || 0, heapUsed);
|
|
|
|
this.maxHeapTotal = Math.max(this.maxHeapTotal || 0, heapTotal);
|
|
|
|
logger.debug("Memory", {maxHeapUsed: this.maxHeapUsed, maxHeapTotal: this.maxHeapTotal, ...memUsage}, "memory");
|
|
|
|
}
|
|
|
|
|
2022-12-15 12:38:41 -05:00
|
|
|
async writeStats(toFile=false) {
|
|
|
|
if (!this.params.logging.includes("stats")) {
|
|
|
|
return;
|
|
|
|
}
|
2020-12-02 16:26:20 +00:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const realSize = await this.crawlState.queueSize();
|
2023-01-23 10:43:12 -08:00
|
|
|
const pendingList = await this.crawlState.getPendingList();
|
2022-12-15 12:38:41 -05:00
|
|
|
const done = await this.crawlState.numDone();
|
2023-04-13 16:31:33 -04:00
|
|
|
const failed = await this.crawlState.numFailed();
|
2023-01-23 10:43:12 -08:00
|
|
|
const total = realSize + pendingList.length + done;
|
2023-04-03 11:10:47 -07:00
|
|
|
const limit = {max: this.pageLimit || 0, hit: this.limitHit};
|
2022-12-15 12:38:41 -05:00
|
|
|
const stats = {
|
|
|
|
"crawled": done,
|
|
|
|
"total": total,
|
2023-01-23 10:43:12 -08:00
|
|
|
"pending": pendingList.length,
|
2023-04-13 16:31:33 -04:00
|
|
|
"failed": failed,
|
2022-12-15 12:38:41 -05:00
|
|
|
"limit": limit,
|
2023-01-23 10:43:12 -08:00
|
|
|
"pendingPages": pendingList.map(x => JSON.stringify(x))
|
2022-12-15 12:38:41 -05:00
|
|
|
};
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Crawl statistics", stats, "crawlStatus");
|
2023-04-26 15:41:35 -07:00
|
|
|
this.logMemory();
|
2022-12-15 12:38:41 -05:00
|
|
|
|
|
|
|
if (toFile && this.params.statsFilename) {
|
2020-12-02 16:26:20 +00:00
|
|
|
try {
|
2021-04-29 14:34:56 -07:00
|
|
|
await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2));
|
2020-12-02 16:26:20 +00:00
|
|
|
} catch (err) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn("Stats output failed", err);
|
2020-12-02 16:26:20 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
async loadPage(page, data, selectorOptsList = DEFAULT_SELECTORS) {
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
const {url, seedId, depth, extraHops = 0} = data;
|
2023-02-23 18:50:22 -08:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
const logDetails = data.logDetails;
|
2022-03-22 17:41:51 -07:00
|
|
|
|
2023-04-26 19:49:32 -04:00
|
|
|
const failCrawlOnError = ((depth === 0) && this.params.failOnFailedSeed);
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
let isHTMLPage = await timedRun(
|
2023-03-10 23:11:24 -05:00
|
|
|
this.isHTML(url),
|
|
|
|
FETCH_TIMEOUT_SECS,
|
|
|
|
"HEAD request to determine if URL is HTML page timed out",
|
|
|
|
logDetails
|
|
|
|
);
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
|
|
|
|
if (!isHTMLPage) {
|
2021-07-23 18:31:43 -07:00
|
|
|
try {
|
2023-03-13 14:48:04 -07:00
|
|
|
const captureResult = await timedRun(
|
2023-03-10 23:11:24 -05:00
|
|
|
this.directFetchCapture(url),
|
|
|
|
FETCH_TIMEOUT_SECS,
|
|
|
|
"Direct fetch capture attempt timed out",
|
|
|
|
logDetails
|
|
|
|
);
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (captureResult) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Direct fetch successful", {url, ...logDetails}, "fetch");
|
2022-03-14 11:11:53 -07:00
|
|
|
return;
|
|
|
|
}
|
2021-07-23 18:31:43 -07:00
|
|
|
} catch (e) {
|
|
|
|
// ignore failed direct fetch attempt, do browser-based capture
|
|
|
|
}
|
2021-05-21 15:37:02 -07:00
|
|
|
}
|
|
|
|
|
2022-03-14 11:11:53 -07:00
|
|
|
let ignoreAbort = false;
|
|
|
|
|
|
|
|
// Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
|
|
|
|
// if so, don't report as an error
|
2022-03-22 17:41:51 -07:00
|
|
|
page.once("requestfailed", (req) => {
|
2022-03-14 14:41:39 -07:00
|
|
|
ignoreAbort = shouldIgnoreAbort(req);
|
2022-03-14 11:11:53 -07:00
|
|
|
});
|
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
if (isHTMLPage) {
|
|
|
|
page.once("domcontentloaded", () => {
|
|
|
|
data.loadState = LoadState.CONTENT_LOADED;
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const gotoOpts = isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
|
2022-03-22 17:41:51 -07:00
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Awaiting page load", logDetails);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
try {
|
2023-03-08 21:31:19 -05:00
|
|
|
const resp = await page.goto(url, gotoOpts);
|
|
|
|
|
2023-04-26 19:49:32 -04:00
|
|
|
// Handle 4xx or 5xx response as a page load error
|
|
|
|
const statusCode = resp.status();
|
|
|
|
if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
|
|
|
|
if (failCrawlOnError) {
|
|
|
|
logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
|
|
|
|
} else {
|
|
|
|
logger.error("Page Load Error, skipping page", {statusCode, ...logDetails});
|
|
|
|
throw new Error(`Page ${url} returned status code ${statusCode}`);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-26 15:41:35 -07:00
|
|
|
const contentType = await this.browser.responseHeader(resp, "content-type");
|
2023-03-08 21:31:19 -05:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
isHTMLPage = this.isHTMLContentType(contentType);
|
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
} catch (e) {
|
2022-03-14 11:11:53 -07:00
|
|
|
let msg = e.message || "";
|
|
|
|
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
if (e.name === "TimeoutError") {
|
|
|
|
if (data.loadState !== LoadState.CONTENT_LOADED) {
|
2023-04-26 19:49:32 -04:00
|
|
|
if (failCrawlOnError) {
|
|
|
|
logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
|
|
|
|
} else {
|
|
|
|
logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
|
|
|
|
throw e;
|
|
|
|
}
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
} else {
|
|
|
|
logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
|
|
|
|
data.skipBehaviors = true;
|
|
|
|
}
|
|
|
|
} else {
|
2023-04-26 19:49:32 -04:00
|
|
|
if (failCrawlOnError) {
|
|
|
|
logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
|
|
|
|
} else {
|
|
|
|
logger.error("Page Load Error, skipping page", {msg, ...logDetails});
|
|
|
|
throw e;
|
|
|
|
}
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
}
|
2022-03-14 11:11:53 -07:00
|
|
|
}
|
2021-05-21 15:37:02 -07:00
|
|
|
}
|
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
data.loadState = LoadState.FULL_PAGE_LOADED;
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
data.isHTMLPage = isHTMLPage;
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
if (isHTMLPage) {
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
let frames = await page.frames();
|
|
|
|
frames = await Promise.allSettled(frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)));
|
|
|
|
|
|
|
|
data.filteredFrames = frames.filter((x) => x.status === "fulfilled" && x.value).map(x => x.value);
|
|
|
|
|
|
|
|
//data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails));
|
2023-03-08 21:31:19 -05:00
|
|
|
} else {
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
data.filteredFrames = [];
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
2022-03-22 17:41:51 -07:00
|
|
|
|
|
|
|
if (!isHTMLPage) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Skipping link extraction for non-HTML page", logDetails);
|
2022-03-22 17:41:51 -07:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-07-20 15:45:51 -07:00
|
|
|
const seed = this.params.scopedSeeds[seedId];
|
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
await this.checkCF(page, logDetails);
|
2022-03-18 10:32:59 -07:00
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
await this.netIdle(page, logDetails);
|
2022-09-20 17:09:52 -07:00
|
|
|
|
2021-07-20 15:45:51 -07:00
|
|
|
// skip extraction if at max depth
|
2021-07-23 18:31:43 -07:00
|
|
|
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
2021-07-20 15:45:51 -07:00
|
|
|
return;
|
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Extracting links");
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2021-07-23 18:31:43 -07:00
|
|
|
for (const opts of selectorOptsList) {
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const links = await this.extractLinks(page, data.filteredFrames, opts, logDetails);
|
2023-03-08 21:31:19 -05:00
|
|
|
await this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails);
|
2021-07-23 18:31:43 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
async netIdle(page, details) {
|
2022-07-08 17:17:46 -07:00
|
|
|
if (!this.params.netIdleWait) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// in case page starts loading via fetch/xhr immediately after page load,
|
|
|
|
// we want to ensure we don't exit too early
|
2023-03-13 14:48:04 -07:00
|
|
|
await sleep(0.5);
|
2022-07-08 17:17:46 -07:00
|
|
|
|
|
|
|
try {
|
2023-04-26 15:41:35 -07:00
|
|
|
await this.browser.waitForNetworkIdle(page, {timeout: this.params.netIdleWait * 1000});
|
2022-07-08 17:17:46 -07:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("waitForNetworkIdle timed out, ignoring", details);
|
2022-07-08 17:17:46 -07:00
|
|
|
// ignore, continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
async extractLinks(page, frames, {selector = "a[href]", extract = "href", isAttribute = false} = {}, logDetails) {
|
2021-07-23 18:31:43 -07:00
|
|
|
const results = [];
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const loadProp = (options) => {
|
|
|
|
const { selector, extract } = options;
|
2021-07-23 18:31:43 -07:00
|
|
|
return [...document.querySelectorAll(selector)].map(elem => elem[extract]);
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
};
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const loadAttr = (options) => {
|
|
|
|
const { selector, extract } = options;
|
2021-07-23 18:31:43 -07:00
|
|
|
return [...document.querySelectorAll(selector)].map(elem => elem.getAttribute(extract));
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
const loadFunc = isAttribute ? loadAttr : loadProp;
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
frames = frames || page.frames();
|
2023-03-08 21:31:19 -05:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
try {
|
2023-03-08 21:31:19 -05:00
|
|
|
const linkResults = await Promise.allSettled(
|
2023-03-13 14:48:04 -07:00
|
|
|
frames.map(frame => timedRun(
|
2023-04-26 15:41:35 -07:00
|
|
|
frame.evaluate(loadFunc, {selector, extract}),
|
2023-03-10 23:11:24 -05:00
|
|
|
PAGE_OP_TIMEOUT_SECS,
|
|
|
|
"Link extraction timed out",
|
|
|
|
logDetails,
|
|
|
|
))
|
2023-03-08 21:31:19 -05:00
|
|
|
);
|
2021-07-20 15:45:51 -07:00
|
|
|
|
|
|
|
if (linkResults) {
|
2023-03-08 21:31:19 -05:00
|
|
|
let i = 0;
|
2021-07-20 15:45:51 -07:00
|
|
|
for (const linkResult of linkResults) {
|
2023-03-08 21:31:19 -05:00
|
|
|
if (!linkResult) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn("Link Extraction timed out in frame", {frameUrl: frames[i].url, ...logDetails});
|
2023-03-08 21:31:19 -05:00
|
|
|
continue;
|
|
|
|
}
|
2021-07-23 18:31:43 -07:00
|
|
|
if (!linkResult.value) continue;
|
2021-07-20 15:45:51 -07:00
|
|
|
for (const link of linkResult.value) {
|
|
|
|
results.push(link);
|
|
|
|
}
|
2023-03-08 21:31:19 -05:00
|
|
|
i++;
|
2021-07-20 15:45:51 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-10-31 13:16:37 -07:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn("Link Extraction failed", e);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
2021-07-23 18:31:43 -07:00
|
|
|
return results;
|
2020-11-14 21:55:02 +00:00
|
|
|
}
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
async queueInScopeUrls(seedId, urls, depth, extraHops = 0, logDetails = {}) {
|
2020-10-31 13:16:37 -07:00
|
|
|
try {
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
depth += 1;
|
|
|
|
|
2022-01-15 09:03:09 -08:00
|
|
|
// new number of extra hops, set if this hop is out-of-scope (oos)
|
|
|
|
const newExtraHops = extraHops + 1;
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
2022-01-15 09:03:09 -08:00
|
|
|
for (const possibleUrl of urls) {
|
2023-03-08 21:31:19 -05:00
|
|
|
const res = this.isInScope({url: possibleUrl, extraHops: newExtraHops, depth, seedId}, logDetails);
|
2022-01-15 09:03:09 -08:00
|
|
|
|
|
|
|
if (!res) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
const {url, isOOS} = res;
|
|
|
|
|
|
|
|
if (url) {
|
|
|
|
await this.queueUrl(seedId, url, depth, isOOS ? newExtraHops : extraHops);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error("Queuing Error", e);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
async checkCF(page, logDetails) {
|
2022-03-18 10:32:59 -07:00
|
|
|
try {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Check CF Blocking", logDetails);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-04-26 15:41:35 -07:00
|
|
|
while (await timedRun(
|
|
|
|
page.$("div.cf-browser-verification.cf-im-under-attack"),
|
|
|
|
PAGE_OP_TIMEOUT_SECS
|
|
|
|
)) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails);
|
2023-03-13 14:48:04 -07:00
|
|
|
await sleep(5.5);
|
2022-03-18 10:32:59 -07:00
|
|
|
}
|
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
//logger.warn("Check CF failed, ignoring");
|
2022-03-18 10:32:59 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-15 09:03:09 -08:00
|
|
|
async queueUrl(seedId, url, depth, extraHops = 0) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Queuing url ${url}`);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (this.limitHit) {
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-04-03 11:10:47 -07:00
|
|
|
if (this.pageLimit > 0 && (await this.crawlState.numSeen() >= this.pageLimit)) {
|
2021-01-29 18:26:55 +00:00
|
|
|
this.limitHit = true;
|
2020-11-01 19:22:53 -08:00
|
|
|
return false;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
|
|
|
if (await this.crawlState.has(url)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
//await this.crawlState.add(url);
|
|
|
|
await this.crawlState.addToQueue({url, seedId, depth, extraHops});
|
2020-11-01 19:22:53 -08:00
|
|
|
return true;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
async initPages() {
|
2021-02-04 00:28:32 -05:00
|
|
|
try {
|
2021-04-29 14:34:56 -07:00
|
|
|
let createNew = false;
|
|
|
|
|
2021-04-30 22:05:04 -04:00
|
|
|
// create pages dir if doesn't exist and write pages.jsonl header
|
|
|
|
if (fs.existsSync(this.pagesDir) != true){
|
|
|
|
await fsp.mkdir(this.pagesDir);
|
2021-04-29 14:34:56 -07:00
|
|
|
createNew = true;
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
this.pagesFH = await fsp.open(this.pagesFile, "a");
|
|
|
|
|
|
|
|
if (createNew) {
|
2021-03-31 13:41:27 -04:00
|
|
|
const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
|
2021-02-23 16:52:54 -05:00
|
|
|
if (this.params.text) {
|
2021-03-31 13:41:27 -04:00
|
|
|
header["hasText"] = true;
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Text Extraction: Enabled");
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
} else {
|
2021-03-31 13:41:27 -04:00
|
|
|
header["hasText"] = false;
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Text Extraction: Disabled");
|
2021-02-23 16:52:54 -05:00
|
|
|
}
|
2021-03-31 13:41:27 -04:00
|
|
|
const header_formatted = JSON.stringify(header).concat("\n");
|
2021-04-29 14:34:56 -07:00
|
|
|
await this.pagesFH.writeFile(header_formatted);
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
2021-04-29 14:34:56 -07:00
|
|
|
|
2021-02-04 00:28:32 -05:00
|
|
|
} catch(err) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error("pages/pages.jsonl creation failed", err);
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
async writePage({url, depth, title, text, loadState}) {
|
2021-02-04 00:28:32 -05:00
|
|
|
const id = uuidv4();
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
const row = {id, url, title, loadState};
|
2021-02-23 16:52:54 -05:00
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (depth === 0) {
|
|
|
|
row.seed = true;
|
|
|
|
}
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
if (text !== null) {
|
|
|
|
row.text = text;
|
2021-02-23 16:52:54 -05:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
const processedRow = JSON.stringify(row) + "\n";
|
2021-02-04 00:28:32 -05:00
|
|
|
try {
|
2022-03-14 10:41:56 -07:00
|
|
|
await this.pagesFH.writeFile(processedRow);
|
|
|
|
} catch (err) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn("pages/pages.jsonl append failed", err);
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
resolveAgent(urlParsed) {
|
|
|
|
return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async isHTML(url) {
|
|
|
|
try {
|
|
|
|
const resp = await fetch(url, {
|
|
|
|
method: "HEAD",
|
|
|
|
headers: this.headers,
|
|
|
|
agent: this.resolveAgent
|
|
|
|
});
|
2022-03-14 11:11:53 -07:00
|
|
|
if (resp.status !== 200) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
|
2021-03-13 16:48:31 -08:00
|
|
|
return true;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
return this.isHTMLContentType(resp.headers.get("Content-Type"));
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
} catch(e) {
|
|
|
|
// can't confirm not html, so try in browser
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
logger.debug("HEAD request failed", {...e, url});
|
2023-03-08 21:31:19 -05:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
isHTMLContentType(contentType) {
|
|
|
|
// just load if no content-type
|
|
|
|
if (!contentType) {
|
|
|
|
return true;
|
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
const mime = contentType.split(";")[0];
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
if (HTML_TYPES.includes(mime)) {
|
2020-10-31 13:16:37 -07:00
|
|
|
return true;
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2023-03-08 21:31:19 -05:00
|
|
|
|
|
|
|
return false;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async directFetchCapture(url) {
|
|
|
|
const abort = new AbortController();
|
|
|
|
const signal = abort.signal;
|
2022-03-14 11:11:53 -07:00
|
|
|
const resp = await fetch(this.capturePrefix + url, {signal, headers: this.headers, redirect: "manual"});
|
2020-11-01 19:22:53 -08:00
|
|
|
abort.abort();
|
2022-03-14 11:11:53 -07:00
|
|
|
return resp.status === 200 && !resp.headers.get("set-cookie");
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2021-04-30 12:31:14 -07:00
|
|
|
async awaitPendingClear() {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Waiting to ensure pending data is written to WARCs...");
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.crawlState.setStatus("pending-wait");
|
2021-04-30 12:31:14 -07:00
|
|
|
|
2021-11-23 12:53:30 -08:00
|
|
|
const redis = await initRedis("redis://localhost/0");
|
2021-04-30 12:31:14 -07:00
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
while (!this.interrupted) {
|
2022-09-02 17:53:04 -07:00
|
|
|
try {
|
|
|
|
const count = Number(await redis.get(`pywb:${this.params.collection}:pending`) || 0);
|
|
|
|
if (count <= 0) {
|
|
|
|
break;
|
|
|
|
}
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Waiting for pending requests to finish", {numRequests: count});
|
2022-09-02 17:53:04 -07:00
|
|
|
} catch (e) {
|
2021-04-30 12:31:14 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2023-03-13 14:48:04 -07:00
|
|
|
await sleep(1);
|
2021-04-30 12:31:14 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
async parseSitemap(url, seedId) {
|
2020-11-14 21:55:02 +00:00
|
|
|
const sitemapper = new Sitemapper({
|
|
|
|
url,
|
|
|
|
timeout: 15000,
|
|
|
|
requestHeaders: this.headers
|
|
|
|
});
|
|
|
|
|
|
|
|
try {
|
|
|
|
const { sites } = await sitemapper.fetch();
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
await this.queueInScopeUrls(seedId, sites, 0);
|
2020-11-14 21:55:02 +00:00
|
|
|
} catch(e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn("Error fetching sites from sitemap", e);
|
2020-11-14 21:55:02 +00:00
|
|
|
}
|
|
|
|
}
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
async combineWARC() {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Generating Combined WARCs");
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.crawlState.setStatus("generate-warc");
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
// Get the list of created Warcs
|
2021-04-29 14:34:56 -07:00
|
|
|
const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Combining ${warcLists.length} WARCs...`);
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
const fileSizeObjects = []; // Used to sort the created warc by fileSize
|
|
|
|
|
|
|
|
// Go through a list of the created works and create an array sorted by their filesize with the largest file first.
|
|
|
|
for (let i = 0; i < warcLists.length; i++) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
const fileName = path.join(this.collDir, "archive", warcLists[i]);
|
2022-02-08 15:31:55 -08:00
|
|
|
const fileSize = await getFileSize(fileName);
|
2021-03-31 13:41:27 -04:00
|
|
|
fileSizeObjects.push({"fileSize": fileSize, "fileName": fileName});
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
fileSizeObjects.sort((a, b) => b.fileSize - a.fileSize);
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
const generatedCombinedWarcs = [];
|
|
|
|
|
|
|
|
// Used to name combined warcs, default to -1 for first increment
|
|
|
|
let combinedWarcNumber = -1;
|
|
|
|
|
|
|
|
// write combine WARC to collection root
|
|
|
|
let combinedWarcFullPath = "";
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
// fileHandler
|
|
|
|
let fh = null;
|
|
|
|
|
2021-03-31 13:41:27 -04:00
|
|
|
// Iterate through the sorted file size array.
|
|
|
|
for (let j = 0; j < fileSizeObjects.length; j++) {
|
|
|
|
|
|
|
|
// if need to rollover to new warc
|
|
|
|
let doRollover = false;
|
|
|
|
|
|
|
|
// set to true for first warc
|
|
|
|
if (combinedWarcNumber < 0) {
|
|
|
|
doRollover = true;
|
|
|
|
} else {
|
|
|
|
// Check the size of the existing combined warc.
|
2022-02-08 15:31:55 -08:00
|
|
|
const currentCombinedWarcSize = await getFileSize(combinedWarcFullPath);
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
// If adding the current warc to the existing combined file creates a file smaller than the rollover size add the data to the combinedWarc
|
|
|
|
const proposedWarcSize = fileSizeObjects[j].fileSize + currentCombinedWarcSize;
|
|
|
|
|
|
|
|
doRollover = (proposedWarcSize >= this.params.rolloverSize);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (doRollover) {
|
2021-06-23 19:36:32 -07:00
|
|
|
// If adding the current warc to the existing combined file creates a file larger than the rollover size do the following:
|
2021-03-31 13:41:27 -04:00
|
|
|
// 1. increment the combinedWarcNumber
|
|
|
|
// 2. create the name of the new combinedWarcFile
|
|
|
|
// 3. Write the header out to the new file
|
|
|
|
// 4. Write out the current warc data to the combinedFile
|
|
|
|
combinedWarcNumber = combinedWarcNumber + 1;
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
const combinedWarcName = `${this.params.collection}_${combinedWarcNumber}.warc.gz`;
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
// write combined warcs to root collection dir as they're output of a collection (like wacz)
|
|
|
|
combinedWarcFullPath = path.join(this.collDir, combinedWarcName);
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
if (fh) {
|
|
|
|
fh.end();
|
|
|
|
}
|
|
|
|
|
|
|
|
fh = fs.createWriteStream(combinedWarcFullPath, {flags: "a"});
|
|
|
|
|
2021-03-31 13:41:27 -04:00
|
|
|
generatedCombinedWarcs.push(combinedWarcName);
|
|
|
|
|
|
|
|
const warcBuffer = await this.createWARCInfo(combinedWarcName);
|
2021-04-29 14:34:56 -07:00
|
|
|
fh.write(warcBuffer);
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Appending WARC ${fileSizeObjects[j].fileName}`);
|
2021-04-29 14:34:56 -07:00
|
|
|
|
|
|
|
const reader = fs.createReadStream(fileSizeObjects[j].fileName);
|
|
|
|
|
|
|
|
const p = new Promise((resolve) => {
|
|
|
|
reader.on("end", () => resolve());
|
|
|
|
});
|
|
|
|
|
|
|
|
reader.pipe(fh, {end: false});
|
|
|
|
|
|
|
|
await p;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fh) {
|
|
|
|
await fh.end();
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
async serializeConfig(done = false) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
switch (this.params.saveState) {
|
|
|
|
case "never":
|
|
|
|
return;
|
|
|
|
|
|
|
|
case "partial":
|
2022-03-14 10:41:56 -07:00
|
|
|
if (!done) {
|
|
|
|
return;
|
|
|
|
}
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
if (await this.crawlState.isFinished()) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case "always":
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
const now = new Date();
|
|
|
|
|
|
|
|
if (!done) {
|
|
|
|
// if not done, save state only after specified interval has elapsed
|
2023-03-22 14:50:18 -04:00
|
|
|
if (secondsElapsed(this.lastSaveTime, now) < this.params.saveStateInterval) {
|
2022-03-14 10:41:56 -07:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
this.lastSaveTime = now.getTime();
|
|
|
|
|
|
|
|
const ts = now.toISOString().slice(0,19).replace(/[T:-]/g, "");
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
|
|
|
const crawlDir = path.join(this.collDir, "crawls");
|
|
|
|
|
|
|
|
await fsp.mkdir(crawlDir, {recursive: true});
|
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
const filenameOnly = `crawl-${ts}-${this.params.crawlId}.yaml`;
|
|
|
|
|
|
|
|
const filename = path.join(crawlDir, filenameOnly);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
|
|
|
const state = await this.crawlState.serialize();
|
|
|
|
|
|
|
|
if (this.origConfig) {
|
|
|
|
this.origConfig.state = state;
|
|
|
|
}
|
|
|
|
const res = yaml.dump(this.origConfig, {lineWidth: -1});
|
2022-03-14 10:41:56 -07:00
|
|
|
try {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Saving crawl state to: ${filename}`);
|
2022-03-14 10:41:56 -07:00
|
|
|
await fsp.writeFile(filename, res);
|
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error(`Failed to write save state file: ${filename}`, e);
|
2022-03-14 10:41:56 -07:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
this.saveStateFiles.push(filename);
|
|
|
|
|
|
|
|
if (this.saveStateFiles.length > this.params.saveStateHistory) {
|
|
|
|
const oldFilename = this.saveStateFiles.shift();
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Removing old save-state: ${oldFilename}`);
|
2022-03-14 10:41:56 -07:00
|
|
|
try {
|
|
|
|
await fsp.unlink(oldFilename);
|
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error(`Failed to delete old save state file: ${oldFilename}`);
|
2022-03-14 10:41:56 -07:00
|
|
|
}
|
|
|
|
}
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
|
|
|
if (this.storage && done && this.params.saveState === "always") {
|
|
|
|
const targetFilename = interpolateFilename(filenameOnly, this.crawlId);
|
|
|
|
|
|
|
|
await this.storage.uploadFile(filename, targetFilename);
|
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2022-03-14 14:41:39 -07:00
|
|
|
function shouldIgnoreAbort(req) {
|
|
|
|
try {
|
|
|
|
const failure = req.failure() && req.failure().errorText;
|
|
|
|
if (failure !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
const resp = req.response();
|
|
|
|
const headers = resp && resp.headers();
|
|
|
|
|
|
|
|
if (!headers) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (headers["content-disposition"] ||
|
|
|
|
(headers["content-type"] && !headers["content-type"].startsWith("text/"))) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} catch (e) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|