import child_process, { ChildProcess, StdioOptions } from "child_process";
import path from "path";
import fs, { WriteStream } from "fs";
import os from "os";
import fsp from "fs/promises";

import {
  RedisCrawlState,
  LoadState,
  QueueState,
  PageState,
  WorkerId,
} from "./util/state.js";

import { CrawlerArgs, parseArgs } from "./util/argParser.js";

import yaml from "js-yaml";
import { WACZ, WACZInitOpts, mergeCDXJ } from "./util/wacz.js";

import { HealthChecker } from "./util/healthcheck.js";
import { TextExtractViaSnapshot } from "./util/textextract.js";
import {
  initStorage,
  getFileSize,
  getDirSize,
  interpolateFilename,
  checkDiskUtilization,
  S3StorageSync,
  isDiskFull,
} from "./util/storage.js";
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { initRedis } from "./util/redis.js";
import { logger, formatErr, LogDetails, LogContext } from "./util/logger.js";
import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
import { Browser } from "./util/browser.js";
import {
  DISPLAY,
  ExtractSelector,
  PAGE_OP_TIMEOUT_SECS,
  SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
  ExitCodes,
  InterruptReason,
  BxFunctionBindings,
} from "./util/constants.js";
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
import { OriginOverride } from "./util/originoverride.js";
import {
  CDPSession,
  Frame,
  HTTPRequest,
  HTTPResponse,
  Page,
  Protocol,
} from "puppeteer-core";
import { Recorder } from "./util/recorder.js";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed, parseSeeds } from "./util/seeds.js";
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
import { initProxy } from "./util/proxy.js";
import { initFlow, nextFlowStep } from "./util/flowbehavior.js";
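// contents of the bundled browsertrix-behaviors script, loaded once at startup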
const btrixBehaviors = fs.readFileSync(
  new URL(
    "../node_modules/browsertrix-behaviors/dist/behaviors.js",
    import.meta.url,
  ),
  { encoding: "utf8" },
);
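// spawn child processes detached when DETACHED_CHILD_PROC=1 is set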
const RUN_DETACHED = process.env.DETACHED_CHILD_PROC == "1";
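// crawl states reached after page crawling completes (WACZ/CDX/WARC generation and upload)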
const POST_CRAWL_STATES = [
  "generate-wacz",
  "uploading-wacz",
  "generate-cdx",
  "generate-warc",
];
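// metadata recorded for each crawled page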
type PageEntry = {
  id: string;
  url: string;
  title?: string;
  loadState?: number;
  mime?: string;
  seed?: boolean;
  text?: string;
  favIconUrl?: string;
  ts?: string;
  status?: number;
  depth?: number;
};
// ============================================================================
export class Crawler {
  params: CrawlerArgs;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  origConfig: any;

  collDir: string;
  logDir: string;
  logFilename: string;

  headers: Record<string, string> = {};

  crawlState!: RedisCrawlState;

  pagesFH?: WriteStream | null = null;
  extraPagesFH?: WriteStream | null = null;
  logFH: WriteStream | null = null;

  crawlId: string;

  startTime: number;

  limitHit = false;
  pageLimit: number;

  saveStateFiles: string[] = [];
  lastSaveTime: number;

  maxPageTime: number;

  seeds: ScopedSeed[] = [];
  numOriginalSeeds = 0;

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  emulateDevice: any = {};

  captureBasePrefix = "";

  infoString!: string;

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  gotoOpts: Record<string, any>;

  pagesDir: string;
  seedPagesFile: string;
  otherPagesFile: string;

  archivesDir: string;
  warcCdxDir: string;
  indexesDir: string;

  screenshotWriter: WARCWriter | null;
  textWriter: WARCWriter | null;

  blockRules: BlockRules | null;
  adBlockRules: AdBlockRules | null;

  healthChecker: HealthChecker | null = null;
  originOverride: OriginOverride | null = null;

  screencaster: ScreenCaster | null = null;

  skipTextDocs = 0;

  interruptReason: InterruptReason | null = null;
  finalExit = false;
  uploadAndDeleteLocal = false;
  done = false;
  postCrawling = false;

  textInPages = false;

  customBehaviors = "";
  behaviorsChecked = false;

  browser: Browser;
  storage: S3StorageSync | null = null;

  maxHeapUsed = 0;
  maxHeapTotal = 0;

  proxyServer?: string;
  proxyPacUrl?: string;
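  // optional custom per-page driver: receives the page, its PageState, the seed, and the crawler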
  driver:
    | ((opts: {
        page: Page;
        data: PageState;
        seed: ScopedSeed;
        // eslint-disable-next-line no-use-before-define
        crawler: Crawler;
      }) => Promise<void>)
    | null = null;
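  // false in dry-run mode: only pages and logs are written, no archive data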
  recording: boolean;

  constructor() {
    const args = this.parseArgs();
    this.params = args as CrawlerArgs;
    this.origConfig = this.params.origConfig;

    this.crawlId = this.params.crawlId;

    // root collections dir
    this.collDir = path.join(
      this.params.cwd,
      "collections",
      this.params.collection,
    );
    this.logDir = path.join(this.collDir, "logs");
    this.logFilename = path.join(
      this.logDir,
      `${interpolateFilename("@ts", "")}.log`,
    );

    const debugLogging = this.params.logging.includes("debug");
    logger.setDebugLogging(debugLogging);
    logger.setLogLevel(this.params.logLevel);
    logger.setContext(this.params.logContext);
    logger.setExcludeContext(this.params.logExcludeContext);

    logger.debug("Writing log to: " + this.logFilename, {}, "general");

    this.recording = !this.params.dryRun;
    if (this.params.dryRun) {
      logger.warn(
        "Dry run mode: no archived data stored, only pages and logging. Storage and archive creation related options will be ignored.",
      );
    }

    this.headers = {};

    // pages file
    this.pagesFH = null;
    this.startTime = Date.now();

    // was the limit hit?
    this.limitHit = false;
    this.pageLimit = this.params.pageLimit;

    // resolve maxPageLimit and ensure pageLimit is no greater than maxPageLimit
    if (this.params.maxPageLimit) {
      this.pageLimit = this.pageLimit
        ? Math.min(this.pageLimit, this.params.maxPageLimit)
        : this.params.maxPageLimit;
    }

    this.saveStateFiles = [];
    this.lastSaveTime = 0;

    // sum of page load + behavior timeouts + 2 x pageop timeouts (for cloudflare, link extraction) + extra page delay
    // if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
    this.maxPageTime =
      this.params.pageLoadTimeout +
      this.params.behaviorTimeout +
      PAGE_OP_TIMEOUT_SECS * 2 +
      this.params.pageExtraDelay;
    this.emulateDevice = this.params.emulateDevice || {};

    //this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
    //this.capturePrefix = "";//process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/";
    //this.captureBasePrefix = "";

    this.gotoOpts = {
      waitUntil: this.params.waitUntil,
      timeout: this.params.pageLoadTimeout * 1000,
    };

    // pages directory
    this.pagesDir = path.join(this.collDir, "pages");

    // pages file
    this.seedPagesFile = path.join(this.pagesDir, "pages.jsonl");
    this.otherPagesFile = path.join(this.pagesDir, "extraPages.jsonl");

    // archives dir
    this.archivesDir = path.join(this.collDir, "archive");
    // indexes dirs
    this.warcCdxDir = path.join(this.collDir, "warc-cdx");
    this.indexesDir = path.join(this.collDir, "indexes");

    this.screenshotWriter = null;
    this.textWriter = null;

    this.blockRules = null;
    this.adBlockRules = null;
    this.healthChecker = null;

    this.interruptReason = null;
    this.finalExit = false;
    this.uploadAndDeleteLocal = false;
    this.textInPages = this.params.text.includes("to-pages");
    this.done = false;

    this.customBehaviors = "";
    this.browser = new Browser();
  }
  protected parseArgs() {
    return parseArgs();
  }

  configureUA() {
    // override userAgent
    if (this.params.userAgent) {
      this.emulateDevice.userAgent = this.params.userAgent;
      return this.params.userAgent;
    }

    // if device set, it overrides the default Chrome UA
    if (!this.emulateDevice.userAgent) {
      this.emulateDevice.userAgent = this.browser.getDefaultUA();
    }

    // suffix to append to default userAgent
    if (this.params.userAgentSuffix) {
      this.emulateDevice.userAgent += " " + this.params.userAgentSuffix;
    }
    return this.emulateDevice.userAgent;
  }
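  // initialize the Redis-backed crawl state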
  async initCrawlState() {
    const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-03-17 12:50:32 -07:00
|
|
|
if (!redisUrl.startsWith("redis://")) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.fatal(
|
2023-11-09 19:11:11 -05:00
|
|
|
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-03-17 12:50:32 -07:00
|
|
|
}
|
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-03-17 12:50:32 -07:00
|
|
|
let redis;
|
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-03-17 12:50:32 -07:00
|
|
|
while (true) {
|
|
|
|
|
try {
|
|
|
|
|
redis = await initRedis(redisUrl);
|
|
|
|
|
break;
|
|
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
//logger.fatal("Unable to connect to state store Redis: " + redisUrl);
|
|
|
|
|
logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
|
2023-09-06 11:14:18 -04:00
|
|
|
await sleep(1);
|
2021-09-28 09:41:16 -07:00
|
|
|
}
|
2023-03-17 12:50:32 -07:00
|
|
|
}
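// NOTE (illustrative, not part of the original source): the loop above simply
// retries initRedis() once per second until the Redis state store accepts a
// connection. A minimal bounded variant, assuming the same initRedis/sleep
// helpers (maxRetries is hypothetical):
//
//   async function connectWithRetry(url: string, maxRetries = 60) {
//     for (let i = 0; i < maxRetries; i++) {
//       try {
//         return await initRedis(url);
//       } catch (e) {
//         logger.warn(`Waiting for redis at ${url}`, {}, "state");
//         await sleep(1);
//       }
//     }
//     throw new Error(`Redis at ${url} not reachable after ${maxRetries} attempts`);
//   }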
|
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`,
|
|
|
|
|
{},
|
2023-11-09 19:11:11 -05:00
|
|
|
"state",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-03-22 14:50:18 -04:00
|
|
|
logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
this.crawlState = new RedisCrawlState(
|
|
|
|
|
redis,
|
2025-01-29 18:15:28 -08:00
|
|
|
this.crawlId,
|
2023-11-09 11:27:11 -08:00
|
|
|
this.maxPageTime,
|
2023-11-09 19:11:11 -05:00
|
|
|
os.hostname(),
|
2025-02-06 18:48:40 -08:00
|
|
|
this.params.maxPageRetries,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2021-09-28 09:41:16 -07:00
|
|
|
|
2025-04-09 12:24:29 +02:00
|
|
|
if (this.params.logErrorsToRedis) {
|
|
|
|
|
logger.setLogErrorsToRedis(true);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.params.logBehaviorsToRedis) {
|
|
|
|
|
logger.setLogBehaviorsToRedis(true);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.params.logErrorsToRedis || this.params.logBehaviorsToRedis) {
|
|
|
|
|
logger.setCrawlState(this.crawlState);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// if the crawler automatically restarts on an error exit code,
|
|
|
|
|
// exit with 0 from fatal by default, to avoid unnecessary restart
|
|
|
|
|
// otherwise, exit with default fatal exit code
|
|
|
|
|
if (this.params.restartsOnError) {
|
|
|
|
|
logger.setDefaultFatalExitCode(0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return this.crawlState;
|
|
|
|
|
}
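// NOTE (illustrative, not part of the original source): the state store built
// above is keyed by this.crawlId, so crawler instances sharing the same Redis
// URL and --crawlId also share one queue. A hedged usage sketch (the caller
// shown is bootstrap(), below; the pageLimit value is hypothetical):
//
//   // const state = await this.initCrawlState();
//   // await state.trimToLimit(1000);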
|
|
|
|
|
|
|
|
|
|
async loadCrawlState() {
|
2024-03-15 20:54:43 -04:00
|
|
|
// load full state from config
|
|
|
|
|
if (this.params.state) {
|
2024-06-13 17:18:06 -07:00
|
|
|
await this.crawlState.load(this.params.state, this.seeds, true);
|
2024-03-15 20:54:43 -04:00
|
|
|
// otherwise, just load extra seeds
|
|
|
|
|
} else {
|
|
|
|
|
await this.loadExtraSeeds();
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-30 21:29:41 -07:00
|
|
|
// clear any pending URLs from this instance
|
|
|
|
|
await this.crawlState.clearOwnPendingLocks();
|
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
if (this.params.saveState === "always" && this.params.saveStateInterval) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`,
|
|
|
|
|
{},
|
2023-11-09 19:11:11 -05:00
|
|
|
"state",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2022-03-14 10:41:56 -07:00
|
|
|
}
|
2021-09-28 09:41:16 -07:00
|
|
|
}
|
|
|
|
|
|
2024-03-15 20:54:43 -04:00
|
|
|
async loadExtraSeeds() {
|
|
|
|
|
const extraSeeds = await this.crawlState.getExtraSeeds();
|
|
|
|
|
|
|
|
|
|
for (const { origSeedId, newUrl } of extraSeeds) {
|
2024-06-13 17:18:06 -07:00
|
|
|
const seed = this.seeds[origSeedId];
|
|
|
|
|
this.seeds.push(seed.newScopedSeed(newUrl));
|
2024-03-15 20:54:43 -04:00
|
|
|
}
|
|
|
|
|
}
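// NOTE (illustrative, not part of the original source): sketch of the extra
// seed expansion above, with hypothetical values. An entry such as
//
//   // { origSeedId: 0, newUrl: "https://example.com/moved" }
//
// appends this.seeds[0].newScopedSeed("https://example.com/moved") after the
// original seeds, so the new URL inherits the original seed's scoping rules
// and can later be looked up by seed id via getSeedAt().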
|
|
|
|
|
|
2022-02-23 12:09:48 -08:00
|
|
|
initScreenCaster() {
|
|
|
|
|
let transport;
|
|
|
|
|
|
|
|
|
|
if (this.params.screencastPort) {
|
|
|
|
|
transport = new WSTransport(this.params.screencastPort);
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
`Screencast server started on: ${this.params.screencastPort}`,
|
|
|
|
|
{},
|
2023-11-09 19:11:11 -05:00
|
|
|
"screencast",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2022-02-23 12:09:48 -08:00
|
|
|
}
|
2023-11-09 11:27:11 -08:00
|
|
|
// } else if (this.params.redisStoreUrl && this.params.screencastRedis) {
|
|
|
|
|
// transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId);
|
|
|
|
|
// logger.debug("Screencast enabled via redis pubsub", {}, "screencast");
|
|
|
|
|
// }
|
2022-02-23 12:09:48 -08:00
|
|
|
|
|
|
|
|
if (!transport) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2025-04-01 21:26:12 -04:00
|
|
|
return new ScreenCaster(
|
|
|
|
|
transport,
|
|
|
|
|
this.params.workers,
|
|
|
|
|
this.browser.screenWHRatio,
|
|
|
|
|
);
|
2022-02-23 12:09:48 -08:00
|
|
|
}
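// NOTE (illustrative, not part of the original source): when --screencastPort
// is set, a WSTransport listens on that port and ScreenCaster pushes frames
// for up to this.params.workers browser windows. A hedged client sketch (the
// exact WebSocket path and message format are assumptions, defined by
// WSTransport/ScreenCaster rather than shown here):
//
//   // const ws = new WebSocket(`ws://localhost:${screencastPort}`);
//   // ws.onmessage = (ev) => render(JSON.parse(ev.data));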
|
|
|
|
|
|
2023-11-07 21:38:50 -08:00
|
|
|
launchRedis() {
|
2023-11-09 11:27:11 -08:00
|
|
|
let redisStdio: StdioOptions;
|
2023-11-07 21:38:50 -08:00
|
|
|
|
|
|
|
|
if (this.params.logging.includes("redis")) {
|
|
|
|
|
const redisStderr = fs.openSync(path.join(this.logDir, "redis.log"), "a");
|
|
|
|
|
redisStdio = [process.stdin, redisStderr, redisStderr];
|
|
|
|
|
} else {
|
|
|
|
|
redisStdio = "ignore";
|
|
|
|
|
}
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
let redisArgs: string[] = [];
|
2023-11-07 21:38:50 -08:00
|
|
|
if (this.params.debugAccessRedis) {
|
|
|
|
|
redisArgs = ["--protected-mode", "no"];
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
return child_process.spawn("redis-server", redisArgs, {
|
2025-05-28 12:48:06 -07:00
|
|
|
cwd: os.tmpdir(),
|
2023-11-09 11:27:11 -08:00
|
|
|
stdio: redisStdio,
|
2024-03-21 08:16:59 -07:00
|
|
|
detached: RUN_DETACHED,
|
2023-11-09 11:27:11 -08:00
|
|
|
});
|
2023-11-07 21:38:50 -08:00
|
|
|
}
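// NOTE (illustrative, not part of the original source): roughly the equivalent
// shell invocation of what launchRedis() spawns, assuming --debugAccessRedis
// and redis logging are both enabled (paths are sketches):
//
//   cd "${TMPDIR:-/tmp}"    # cwd: os.tmpdir()
//   redis-server --protected-mode no >> <logDir>/redis.log 2>&1 &
//
// Without those options, redis-server is started with no extra args and its
// output is discarded ("ignore" stdio).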
|
|
|
|
|
|
|
|
|
|
async bootstrap() {
|
2025-02-06 17:54:51 -08:00
|
|
|
if (await isDiskFull(this.params.cwd)) {
|
2025-11-25 07:58:30 -08:00
|
|
|
await logger.interrupt(
|
2025-02-06 17:54:51 -08:00
|
|
|
"Out of disk space, exiting",
|
|
|
|
|
{},
|
|
|
|
|
ExitCodes.OutOfSpace,
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const subprocesses: ChildProcess[] = [];
|
2023-11-07 21:38:50 -08:00
|
|
|
|
2024-07-23 18:50:26 -07:00
|
|
|
const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
|
|
|
|
|
|
|
|
|
|
if (
|
|
|
|
|
redisUrl.startsWith("redis://localhost:") ||
|
|
|
|
|
redisUrl.startsWith("redis://127.0.0.1:")
|
|
|
|
|
) {
|
|
|
|
|
subprocesses.push(this.launchRedis());
|
|
|
|
|
}
|
2023-11-07 21:38:50 -08:00
|
|
|
|
2025-04-09 21:37:46 +02:00
|
|
|
await this.initCrawlState();
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
await fsp.mkdir(this.logDir, { recursive: true });
|
2024-06-07 10:34:19 -07:00
|
|
|
|
|
|
|
|
if (!this.params.dryRun) {
|
|
|
|
|
await fsp.mkdir(this.archivesDir, { recursive: true });
|
2024-08-29 13:21:20 -07:00
|
|
|
await fsp.mkdir(this.warcCdxDir, { recursive: true });
|
2024-06-07 10:34:19 -07:00
|
|
|
}
|
2024-03-22 17:32:42 -07:00
|
|
|
|
2025-11-25 07:58:30 -08:00
|
|
|
logger.openLog(this.logFilename);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2024-03-18 14:24:48 -07:00
|
|
|
this.infoString = await getInfoString();
|
2024-05-22 15:47:05 -07:00
|
|
|
setWARCInfo(this.infoString, this.params.warcInfo);
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(this.infoString);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2025-08-20 16:07:29 -07:00
|
|
|
const res = await initProxy(this.params, RUN_DETACHED);
|
|
|
|
|
this.proxyServer = res.proxyServer;
|
|
|
|
|
this.proxyPacUrl = res.proxyPacUrl;
|
2024-08-28 18:47:24 -07:00
|
|
|
|
2025-07-03 10:49:37 -04:00
|
|
|
this.seeds = await parseSeeds(this.params);
|
|
|
|
|
this.numOriginalSeeds = this.seeds.length;
|
|
|
|
|
|
2024-06-13 17:18:06 -07:00
|
|
|
logger.info("Seeds", this.seeds);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2024-11-08 08:04:41 -08:00
|
|
|
logger.info("Link Selectors", this.params.selectLinks);
|
|
|
|
|
|
2024-08-27 16:20:19 -04:00
|
|
|
if (this.params.behaviorOpts) {
|
|
|
|
|
logger.info("Behavior Options", this.params.behaviorOpts);
|
|
|
|
|
} else {
|
|
|
|
|
logger.info("Behaviors disabled");
|
|
|
|
|
}
|
2024-07-23 18:50:26 -07:00
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
if (this.params.profile) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info("With Browser Profile", { url: this.params.profile });
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
|
|
|
|
|
2023-02-03 00:02:47 -05:00
|
|
|
if (this.params.overwrite) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Clearing ${this.collDir} before starting`);
|
2023-02-03 00:02:47 -05:00
|
|
|
try {
|
|
|
|
|
fs.rmSync(this.collDir, { recursive: true, force: true });
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error(`Unable to clear ${this.collDir}`, e);
|
2023-02-03 00:02:47 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-07-06 16:09:48 -04:00
|
|
|
if (this.params.customBehaviors) {
|
2024-11-04 23:30:53 -05:00
|
|
|
this.customBehaviors = await this.loadCustomBehaviors(
|
2023-11-09 19:11:11 -05:00
|
|
|
this.params.customBehaviors,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-07-06 16:09:48 -04:00
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
this.headers = { "User-Agent": this.configureUA() };
|
2020-11-03 17:16:29 +00:00
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
process.on("exit", () => {
|
|
|
|
|
for (const proc of subprocesses) {
|
|
|
|
|
proc.kill();
|
|
|
|
|
}
|
|
|
|
|
});
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2024-06-20 20:10:25 -07:00
|
|
|
if (this.params.debugAccessBrowser) {
|
|
|
|
|
child_process.spawn(
|
|
|
|
|
"socat",
|
|
|
|
|
["tcp-listen:9222,reuseaddr,fork", "tcp:localhost:9221"],
|
|
|
|
|
{ detached: RUN_DETACHED },
|
|
|
|
|
);
|
|
|
|
|
}
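// NOTE (illustrative, not part of the original source): the socat child above
// makes the browser's CDP endpoint reachable for debugging - connections to
// host port 9222 are forwarded to localhost:9221, where the browser is
// expected to expose devtools. Shell equivalent of the spawn:
//
//   socat tcp-listen:9222,reuseaddr,fork tcp:localhost:9221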
|
2022-08-21 00:30:25 -07:00
|
|
|
|
2021-09-28 09:41:16 -07:00
|
|
|
if (!this.params.headless && !process.env.NO_XVFB) {
|
2024-03-21 08:16:59 -07:00
|
|
|
child_process.spawn(
|
|
|
|
|
"Xvfb",
|
|
|
|
|
[
|
2024-06-25 13:53:43 -07:00
|
|
|
DISPLAY,
|
2024-03-21 08:16:59 -07:00
|
|
|
"-listen",
|
|
|
|
|
"tcp",
|
|
|
|
|
"-screen",
|
|
|
|
|
"0",
|
|
|
|
|
process.env.GEOMETRY || "",
|
|
|
|
|
"-ac",
|
|
|
|
|
"+extension",
|
|
|
|
|
"RANDR",
|
|
|
|
|
],
|
|
|
|
|
{ detached: RUN_DETACHED },
|
|
|
|
|
);
|
2020-11-01 19:22:53 -08:00
|
|
|
}
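// NOTE (illustrative, not part of the original source): the Xvfb spawn above
// corresponds roughly to the following shell command, with DISPLAY and
// GEOMETRY taken from constants / the environment:
//
//   Xvfb $DISPLAY -listen tcp -screen 0 $GEOMETRY -ac +extension RANDR &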
|
2024-03-26 14:54:27 -07:00
|
|
|
|
2024-06-07 10:34:19 -07:00
|
|
|
if (this.params.screenshot && !this.params.dryRun) {
|
2024-03-26 14:54:27 -07:00
|
|
|
this.screenshotWriter = this.createExtraResourceWarcWriter("screenshots");
|
|
|
|
|
}
|
2024-06-07 10:34:19 -07:00
|
|
|
if (this.params.text && !this.params.dryRun) {
|
2024-03-26 14:54:27 -07:00
|
|
|
this.textWriter = this.createExtraResourceWarcWriter("text");
|
|
|
|
|
}
|
2025-04-09 12:24:29 +02:00
|
|
|
|
|
|
|
|
await this.loadCrawlState();
|
2025-04-29 18:18:04 -07:00
|
|
|
|
|
|
|
|
await this.crawlState.trimToLimit(this.pageLimit);
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
|
|
|
|
|
2022-11-21 11:59:37 -08:00
|
|
|
extraChromeArgs() {
|
2025-11-11 21:03:30 +01:00
|
|
|
const args: string[] = [];
|
2022-11-21 11:59:37 -08:00
|
|
|
if (this.params.lang) {
|
2025-05-12 16:06:29 -07:00
|
|
|
if (this.params.profile) {
|
|
|
|
|
logger.warn(
|
|
|
|
|
"Ignoring --lang option with profile, using language configured in the profile",
|
|
|
|
|
{ lang: this.params.lang },
|
|
|
|
|
);
|
|
|
|
|
} else {
|
|
|
|
|
args.push(`--accept-lang=${this.params.lang}`);
|
|
|
|
|
}
|
2022-11-21 11:59:37 -08:00
|
|
|
}
|
2025-11-11 21:03:30 +01:00
|
|
|
|
|
|
|
|
const extra = this.params.extraChromeArgs;
|
|
|
|
|
if (Array.isArray(extra) && extra.length > 0) {
|
|
|
|
|
for (const v of extra) {
|
|
|
|
|
if (v) {
|
|
|
|
|
args.push(String(v));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-21 11:59:37 -08:00
|
|
|
return args;
|
|
|
|
|
}
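// NOTE (illustrative, not part of the original source): example of the flags
// produced above with hypothetical params - lang "en-US", no profile, and
// extraChromeArgs ["--disable-dev-shm-usage"]:
//
//   // ["--accept-lang=en-US", "--disable-dev-shm-usage"]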
|
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async run() {
|
2022-08-11 18:44:39 -07:00
|
|
|
await this.bootstrap();
|
2021-08-17 20:54:18 -07:00
|
|
|
|
2023-09-06 11:14:18 -04:00
|
|
|
let status = "done";
|
2025-02-06 17:54:51 -08:00
|
|
|
let exitCode = ExitCodes.Success;
|
2020-10-31 13:16:37 -07:00
|
|
|
|
|
|
|
|
try {
|
2020-11-01 19:22:53 -08:00
|
|
|
await this.crawl();
|
2023-09-06 11:14:18 -04:00
|
|
|
const finished = await this.crawlState.isFinished();
|
2023-09-13 10:48:21 -07:00
|
|
|
const stopped = await this.crawlState.isCrawlStopped();
|
2023-10-09 12:28:58 -07:00
|
|
|
const canceled = await this.crawlState.isCrawlCanceled();
|
2023-09-13 22:54:55 -07:00
|
|
|
if (!finished) {
|
2023-10-09 12:28:58 -07:00
|
|
|
if (canceled) {
|
|
|
|
|
status = "canceled";
|
|
|
|
|
} else if (stopped) {
|
2023-09-13 22:54:55 -07:00
|
|
|
status = "done";
|
|
|
|
|
logger.info("Crawl gracefully stopped on request");
|
2025-02-10 23:00:55 +01:00
|
|
|
} else if (this.interruptReason) {
|
2023-09-13 22:54:55 -07:00
|
|
|
status = "interrupted";
|
2025-02-10 23:00:55 +01:00
|
|
|
switch (this.interruptReason) {
|
|
|
|
|
case InterruptReason.SizeLimit:
|
|
|
|
|
exitCode = ExitCodes.SizeLimit;
|
|
|
|
|
break;
|
|
|
|
|
case InterruptReason.BrowserCrashed:
|
|
|
|
|
exitCode = ExitCodes.BrowserCrashed;
|
|
|
|
|
break;
|
|
|
|
|
case InterruptReason.SignalInterrupted:
|
|
|
|
|
exitCode = ExitCodes.SignalInterrupted;
|
|
|
|
|
break;
|
|
|
|
|
case InterruptReason.DiskUtilization:
|
|
|
|
|
exitCode = ExitCodes.DiskUtilization;
|
|
|
|
|
break;
|
|
|
|
|
case InterruptReason.FailedLimit:
|
|
|
|
|
exitCode = ExitCodes.FailedLimit;
|
|
|
|
|
break;
|
|
|
|
|
case InterruptReason.TimeLimit:
|
|
|
|
|
exitCode = ExitCodes.TimeLimit;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2023-09-13 22:54:55 -07:00
|
|
|
}
|
2023-09-06 11:14:18 -04:00
|
|
|
}
|
2025-07-08 13:08:52 -07:00
|
|
|
if (await this.crawlState.isFailed()) {
|
|
|
|
|
logger.error("Crawl failed, no pages crawled successfully");
|
|
|
|
|
status = "failed";
|
|
|
|
|
exitCode = ExitCodes.Failed;
|
|
|
|
|
}
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error("Crawl failed", e);
|
2025-02-06 17:54:51 -08:00
|
|
|
exitCode = ExitCodes.Failed;
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of 3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting because the size limit was exceeded, but not due to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
status = "failing";
|
|
|
|
|
if (await this.crawlState.incFailCount()) {
|
|
|
|
|
status = "failed";
|
|
|
|
|
}
|
|
|
|
|
} finally {
|
2023-10-09 12:28:58 -07:00
|
|
|
await this.setStatusAndExit(exitCode, status);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
}
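// NOTE (descriptive, not part of the original source): summary of the exit
// handling above - an unfinished crawl maps its interrupt reason onto the
// matching ExitCodes member (SizeLimit, BrowserCrashed, SignalInterrupted,
// DiskUtilization, FailedLimit, TimeLimit); a crawl with no successful pages
// exits with ExitCodes.Failed; otherwise exitCode stays ExitCodes.Success and
// the final status is reported via setStatusAndExit() in the finally block.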
|
2021-04-10 13:08:22 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
_behaviorLog(
|
|
|
|
|
{ data, type }: { data: string; type: string },
|
|
|
|
|
pageUrl: string,
|
2023-11-09 19:11:11 -05:00
|
|
|
workerid: WorkerId,
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
2023-02-23 18:50:22 -08:00
|
|
|
let message;
|
|
|
|
|
let details;
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const logDetails = { page: pageUrl, workerid };
|
2023-03-17 12:50:32 -07:00
|
|
|
|
2025-04-03 18:46:10 -04:00
|
|
|
let context: LogContext = "behaviorScript";
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
if (typeof data === "string") {
|
2023-02-23 18:50:22 -08:00
|
|
|
message = data;
|
2023-03-17 12:50:32 -07:00
|
|
|
details = logDetails;
|
2023-02-23 18:50:22 -08:00
|
|
|
} else {
|
2025-04-03 18:46:10 -04:00
|
|
|
switch (type) {
|
|
|
|
|
case "error":
|
|
|
|
|
message = "Behavior error";
|
|
|
|
|
break;
|
|
|
|
|
case "debug":
|
|
|
|
|
message = "Behavior debug";
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
message = "Behavior log";
|
|
|
|
|
}
|
2023-11-09 11:27:11 -08:00
|
|
|
details =
|
|
|
|
|
typeof data === "object"
|
|
|
|
|
? { ...(data as object), ...logDetails }
|
|
|
|
|
: logDetails;
|
2025-04-03 18:46:10 -04:00
|
|
|
|
|
|
|
|
if (typeof data === "object") {
|
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
const objData = data as any;
|
|
|
|
|
if (objData.siteSpecific) {
|
|
|
|
|
context = "behaviorScriptCustom";
|
|
|
|
|
delete objData.siteSpecific;
|
|
|
|
|
}
|
|
|
|
|
message = objData.msg || message;
|
|
|
|
|
delete objData.msg;
|
|
|
|
|
details = { ...objData, ...logDetails };
|
|
|
|
|
} else {
|
|
|
|
|
details = logDetails;
|
|
|
|
|
}
|
2023-02-23 18:50:22 -08:00
|
|
|
}
|
2022-07-08 17:17:46 -07:00
|
|
|
|
2021-04-10 13:08:22 -07:00
|
|
|
switch (type) {
|
2023-11-09 19:11:11 -05:00
|
|
|
case "info":
|
2025-07-30 16:20:14 -07:00
|
|
|
logger.info(message, details, context);
|
2023-11-09 19:11:11 -05:00
|
|
|
break;
|
2021-04-10 13:08:22 -07:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
case "error":
|
2025-04-03 18:46:10 -04:00
|
|
|
logger.error(message, details, context);
|
2023-11-09 19:11:11 -05:00
|
|
|
break;
|
2023-07-06 16:09:48 -04:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
case "debug":
|
|
|
|
|
default:
|
2025-04-03 18:46:10 -04:00
|
|
|
logger.debug(message, details, context);
|
2021-04-10 13:08:22 -07:00
|
|
|
}
|
|
|
|
|
}
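// NOTE (illustrative, not part of the original source): hedged example of the
// routing above. A hypothetical message from the in-page behavior,
//
//   // { type: "info", data: { msg: "autoscroll done", siteSpecific: true, state: { segments: 12 } } }
//
// would be logged via logger.info() with message "autoscroll done", context
// "behaviorScriptCustom" (because siteSpecific was set), and details
// { state: { segments: 12 }, page, workerid }.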
|
|
|
|
|
|
2024-06-18 16:11:48 -07:00
|
|
|
protected getScope(
|
2023-11-09 11:27:11 -08:00
|
|
|
{
|
|
|
|
|
seedId,
|
|
|
|
|
url,
|
|
|
|
|
depth,
|
|
|
|
|
extraHops,
|
2024-07-11 19:48:43 -07:00
|
|
|
noOOS,
|
|
|
|
|
}: {
|
|
|
|
|
seedId: number;
|
|
|
|
|
url: string;
|
|
|
|
|
depth: number;
|
|
|
|
|
extraHops: number;
|
|
|
|
|
noOOS: boolean;
|
|
|
|
|
},
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails = {},
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
2024-07-11 19:48:43 -07:00
|
|
|
return this.seeds[seedId].isIncluded(
|
|
|
|
|
url,
|
|
|
|
|
depth,
|
|
|
|
|
extraHops,
|
|
|
|
|
logDetails,
|
|
|
|
|
noOOS,
|
|
|
|
|
);
|
2024-06-18 16:11:48 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async isInScope(
|
|
|
|
|
{
|
|
|
|
|
seedId,
|
|
|
|
|
url,
|
|
|
|
|
depth,
|
|
|
|
|
extraHops,
|
|
|
|
|
}: { seedId: number; url: string; depth: number; extraHops: number },
|
|
|
|
|
logDetails = {},
|
|
|
|
|
): Promise<boolean> {
|
2024-06-13 17:18:06 -07:00
|
|
|
const seed = await this.crawlState.getSeedAt(
|
|
|
|
|
this.seeds,
|
|
|
|
|
this.numOriginalSeeds,
|
|
|
|
|
seedId,
|
|
|
|
|
);
|
2022-09-20 17:09:52 -07:00
|
|
|
|
2024-06-18 16:11:48 -07:00
|
|
|
return !!seed.isIncluded(url, depth, extraHops, logDetails);
|
2022-09-20 17:09:52 -07:00
|
|
|
}
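// NOTE (illustrative, not part of the original source): hedged usage sketch
// with hypothetical values - a link discovered at depth 2 with one extra hop
// from seed 0 is queued only if that seed's scope rules include it:
//
//   // const keep = await this.isInScope(
//   //   { seedId: 0, url: "https://example.com/page", depth: 2, extraHops: 1 },
//   //   logDetails,
//   // );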
|
|
|
|
|
|
Autoclick Support (#729)
Adds support for autoclick behavior:
- Adds new `autoclick` behavior option to `--behaviors`, but not
enabled by default
- Adds support for new exposed function `__bx_addSet` which allows
autoclick behavior to persist state about links that have already been
clicked to avoid duplicates, only used if link has an href
- Adds a new pageFinished flag on the worker state.
- Adds an on('dialog') handler to reject onbeforeunload page navigations,
when in behavior (page not finished), but accept when page is finished -
to allow navigation away only when behaviors are done
- Update to browsertrix-behaviors 0.7.0, which supports autoclick
- Add --clickSelector option to customize elements that will be clicked,
defaulting to `a`.
- Add --linkSelector as alias for --selectLinks for consistency
- Unknown options for --behaviors printed as warnings, instead of hard
exit, for forward compatibility for new behavior types in the future
Fixes #728, also #216, #665, #31
2025-01-16 09:38:11 -08:00
  async setupPage(opts: WorkerState) {
    const { page, cdp, workerid, callbacks, frameIdToExecId, recorder } = opts;

    await this.browser.setupPage({ page, cdp });

    await this.setupExecContextEvents(cdp, frameIdToExecId);

    if (
      (this.adBlockRules && this.params.blockAds) ||
      this.blockRules ||
      this.originOverride
    ) {
      await page.setRequestInterception(true);

      if (this.adBlockRules && this.params.blockAds) {
        await this.adBlockRules.initPage(this.browser, page);
      }

      if (this.blockRules) {
        await this.blockRules.initPage(this.browser, page);
      }

      if (this.originOverride) {
        await this.originOverride.initPage(this.browser, page);
      }
    }

    if (this.params.logging.includes("jserrors")) {
      page.on("console", (msg) => {
        if (msg.type() === "error") {
          logger.warn(
            msg.text(),
            { location: msg.location(), page: page.url(), workerid },
            "jsError",
          );
        }
      });

      page.on("pageerror", (e) => {
        logger.warn(
          "Page Error",
          { ...formatErr(e), page: page.url(), workerid },
          "jsError",
        );
      });
    }

    if (this.screencaster) {
      logger.debug("Start Screencast", { workerid }, "screencast");
      await this.screencaster.screencastPage(page, cdp, workerid);
    }

    await page.exposeFunction(
      BxFunctionBindings.AddLinkFunc,
      (url: string) => callbacks.addLink && callbacks.addLink(url),
    );

    // used for both behaviors and link extraction now
    await this.browser.addInitScript(page, btrixBehaviors);

    if (this.params.behaviorOpts) {
      await page.exposeFunction(
        BxFunctionBindings.BehaviorLogFunc,
        (logdata: { data: string; type: string }) =>
          this._behaviorLog(logdata, page.url(), workerid),
      );

      const initScript = `
self.__bx_behaviors.init(${this.params.behaviorOpts}, false);
${this.customBehaviors}
self.__bx_behaviors.selectMainBehavior();
`;
      if (!this.behaviorsChecked && this.customBehaviors) {
        await this.checkBehaviorScripts(cdp);
        this.behaviorsChecked = true;
      }

      await page.exposeFunction(BxFunctionBindings.FetchFunc, (url: string) => {
        return recorder ? recorder.addExternalFetch(url, cdp) : false;
      });

      await this.browser.addInitScript(page, initScript);
    }

    // Handle JS dialogs:
    // - Ensure off-page navigation is canceled while behavior is running
    // - dismiss / close all other dialogs if not blocking unload
    page.on("dialog", async (dialog) => {
      let accepted = true;
      if (dialog.type() === "beforeunload") {
        if (opts.pageBlockUnload) {
          accepted = false;
          await dialog.dismiss();
        } else {
          await dialog.accept();
        }
      } else {
        // other JS dialog, just dismiss
        await dialog.dismiss();
      }
      logger.debug("JS Dialog", {
        accepted,
        blockingUnload: opts.pageBlockUnload,
        message: dialog.message(),
        type: dialog.type(),
        page: page.url(),
        workerid,
      });
    });

    // only add if running with autoclick behavior
    if (this.params.behaviors.includes("autoclick")) {
      // Close any windows opened during navigation from autoclick
      await cdp.send("Target.setDiscoverTargets", { discover: true });

      cdp.on("Target.targetCreated", async (params) => {
        const { targetInfo } = params;
        const { type, openerFrameId, targetId } = targetInfo;

        try {
          if (
            type === "page" &&
            openerFrameId &&
            opts.frameIdToExecId.has(openerFrameId)
          ) {
            await cdp.send("Target.closeTarget", { targetId });
          } else {
            logger.warn("Extra target not closed", { targetInfo });
          }

          await cdp.send("Runtime.runIfWaitingForDebugger");
        } catch (e) {
          // target likely already closed
        }
      });

      void cdp.send("Target.setAutoAttach", {
        autoAttach: true,
        waitForDebuggerOnStart: true,
        flatten: false,
      });

      if (this.recording) {
        await cdp.send("Page.enable");

        cdp.on("Page.windowOpen", async (params) => {
          const { seedId, depth, extraHops = 0, url } = opts.data;

          const logDetails = { page: url, workerid };

          await this.queueInScopeUrls(
            seedId,
            [params.url],
            depth,
            extraHops,
            false,
            logDetails,
          );
        });
      }
    }

    await page.exposeFunction(BxFunctionBindings.AddToSeenSet, (data: string) =>
      this.crawlState.addToUserSet(data),
    );

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    await page.exposeFunction(BxFunctionBindings.InitFlow, (params: any) => {
      return initFlow(params, recorder, cdp, this.crawlState, workerid);
    });

    await page.exposeFunction(BxFunctionBindings.NextFlowStep, (id: string) => {
      return nextFlowStep(id, page, workerid);
    });

    if (this.params.failOnContentCheck) {
      await page.exposeFunction(
        BxFunctionBindings.ContentCheckFailed,
        (reason: string) => {
          // if called outside of awaitPageLoad(), ignore
          if (!opts.data.contentCheckAllowed) {
            return;
          }
          void this.crawlState.setFailReason(reason);
          logger.fatal(
            "Content check failed, failing crawl",
            { reason },
            "behavior",
            ExitCodes.Failed,
          );
        },
      );
    }
  }
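
  // Maintains the frameId -> execution context id map from CDP Runtime
  // events (context created / destroyed / cleared), presumably so scripts can
  // later be evaluated in the correct frame.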
  async setupExecContextEvents(
    cdp: CDPSession,
    frameIdToExecId: Map<string, number>,
  ) {
    await cdp.send("Runtime.enable");

    cdp.on(
      "Runtime.executionContextCreated",
      (params: Protocol.Runtime.ExecutionContextCreatedEvent) => {
        const { id, auxData } = params.context;
        if (auxData && auxData.isDefault && auxData.frameId) {
          frameIdToExecId.set(auxData.frameId, id);
        }
      },
    );

    cdp.on(
      "Runtime.executionContextDestroyed",
      (params: Protocol.Runtime.ExecutionContextDestroyedEvent) => {
        const { executionContextId } = params;
        for (const [frameId, execId] of frameIdToExecId.entries()) {
          if (execId === executionContextId) {
            frameIdToExecId.delete(frameId);
            break;
          }
        }
      },
    );

    cdp.on("Runtime.executionContextsCleared", () => {
      frameIdToExecId.clear();
    });
  }
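
  // Builds a single script string from all collected custom behaviors, one
  //   self.__bx_behaviors.load(<contents>);
  // line per behavior, for inclusion in the page init script assembled in
  // setupPage().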
  async loadCustomBehaviors(sources: string[]) {
    let str = "";

    for (const { contents } of await collectCustomBehaviors(sources)) {
      str += `self.__bx_behaviors.load(${contents});\n`;
    }

    return str;
  }
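
  // Validates configured custom behavior scripts by compiling each one in the
  // browser via browser.checkScript(); called once from setupPage() when
  // custom behaviors are present.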
  async checkBehaviorScripts(cdp: CDPSession) {
    const sources = this.params.customBehaviors;

    if (!sources) {
      return;
    }

    for (const { path, contents } of await collectCustomBehaviors(sources)) {
      await this.browser.checkScript(cdp, path, contents);
    }
  }
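
  // Looks up the favicon URL for this page's target from the browser's
  // DevTools /json endpoint on port 9221; logs a warning and returns an empty
  // string if it cannot be determined.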
  async getFavicon(page: Page, logDetails: LogDetails): Promise<string> {
    try {
      const resp = await fetch("http://127.0.0.1:9221/json");
      if (resp.status === 200) {
        const browserJson = await resp.json();
        for (const jsons of browserJson) {
          // eslint-disable-next-line @typescript-eslint/no-explicit-any
          if (jsons.id === (page.target() as any)._targetId) {
            return jsons.faviconUrl;
          }
        }
      }
    } catch (e) {
      // ignore
    }
    logger.warn(
      "Failed to fetch favicon from browser /json endpoint",
      logDetails,
    );
    return "";
  }
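
  // Main per-page crawl routine: first attempts a direct (non-browser) fetch
  // through the recorder, and only if that response is not accepted proceeds
  // with a full browser load, title/favicon capture, post-load actions, and
  // any extra page delay.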
  async crawlPage(opts: WorkerState): Promise<void> {
    await this.writeStats();

    const { page, cdp, data, workerid, callbacks, recorder } = opts;
    data.callbacks = callbacks;

    const { url, seedId } = data;

    const auth = this.seeds[seedId].authHeader();

    if (auth) {
      logger.debug("Setting HTTP basic auth for seed", {
        seedId,
        seedUrl: this.seeds[seedId].url,
      });
    }

    const logDetails = { page: url, workerid };
    data.logDetails = logDetails;
    data.workerid = workerid;

    let result = false;

    if (recorder) {
      try {
        const headers = auth
          ? { Authorization: auth, ...this.headers }
          : this.headers;

        result = await timedRun(
          recorder.directFetchCapture({
            url,
            headers,
            cdp,
            state: data,
            crawler: this,
          }),
          this.params.pageLoadTimeout,
          "Direct fetch of page URL timed out",
          logDetails,
          "fetch",
        );
      } catch (e) {
        logger.error(
          "Direct fetch of page URL failed",
          { e, ...logDetails },
          "fetch",
        );
      }

      if (!result) {
        logger.debug(
          "Direct fetch response not accepted, continuing with browser fetch",
          logDetails,
          "fetch",
        );
      } else {
        return;
      }
    }

    opts.markPageUsed();
    opts.pageBlockUnload = false;

    if (auth) {
      await page.setExtraHTTPHeaders({ Authorization: auth });
      opts.isAuthSet = true;
    } else if (opts.isAuthSet) {
      await page.setExtraHTTPHeaders({});
    }

    const seed = await this.crawlState.getSeedAt(
      this.seeds,
      this.numOriginalSeeds,
      seedId,
    );

    if (recorder) {
      recorder.pageSeed = seed;
    }

    // run custom driver here, if any
    if (this.driver) {
      await this.driver({ page, data, crawler: this, seed });
    } else {
      await this.loadPage(page, data, seed);
    }

    data.title = await timedRun(
      page.title(),
      PAGE_OP_TIMEOUT_SECS,
      "Timed out getting page title, something is likely wrong",
      logDetails,
    );
    data.favicon = await this.getFavicon(page, logDetails);

    opts.pageBlockUnload = true;

    await this.doPostLoadActions(opts);

    opts.pageBlockUnload = false;

    await this.awaitPageExtraDelay(opts);
  }
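
  // Post-load processing for HTML pages: screenshots (view / fullPage /
  // thumbnail), text extraction via DOM snapshot, and, when behaviorOpts is
  // set and the page status is below 400, behavior-related handling (skipped
  // for pages marked slow).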
  async doPostLoadActions(opts: WorkerState, saveOutput = false) {
    const { page, cdp, data, workerid } = opts;
    const { url } = data;

    if (!data.isHTMLPage) {
      return;
    }

    const logDetails = { page: url, workerid };

    if (this.params.screenshot && this.screenshotWriter) {
      const screenshots = new Screenshots({
        browser: this.browser,
        page,
        url,
        writer: this.screenshotWriter,
      });
      if (this.params.screenshot.includes("view")) {
        await screenshots.take("view", saveOutput ? data : null);
      }
      if (this.params.screenshot.includes("fullPage")) {
        await screenshots.takeFullPage();
      }
      if (this.params.screenshot.includes("thumbnail")) {
        await screenshots.takeThumbnail();
      }
    }

    let textextract = null;

    if (this.textWriter) {
      textextract = new TextExtractViaSnapshot(cdp, {
        writer: this.textWriter,
        url,
        skipDocs: this.skipTextDocs,
      });
      const { text } = await textextract.extractAndStoreText(
        "text",
        false,
        this.params.text.includes("to-warc"),
      );

      if (text !== null && (this.textInPages || saveOutput)) {
        data.text = text;
      }
    }
|
2022-12-21 12:06:13 -05:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
data.loadState = LoadState.EXTRACTION_DONE;
|
2022-12-21 12:06:13 -05:00
|
|
|
|
2024-03-28 10:21:31 -07:00
|
|
|
if (this.params.behaviorOpts && data.status < 400) {
|
2024-06-26 09:16:24 -07:00
|
|
|
if (data.skipBehaviors) {
|
2025-04-09 12:24:29 +02:00
|
|
|
logger.warn("Skipping behaviors for slow page", logDetails, "behavior");
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers, each running in its own loop to process pages until there are no pending or queued pages left
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: additional logging for screencaster, new window, etc.
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
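The sorted-set crawl queue mentioned in the bullets above ("set depth + extraHops as score") can be sketched with ioredis as follows; the key name and JSON payload shape here are assumptions for illustration, not the actual RedisCrawlState layout:

import Redis from "ioredis";

// Sketch only: queue URLs in a Redis sorted set, scored by depth + extraHops,
// so shallower pages are popped first.
const redis = new Redis("redis://localhost:6379/0");

async function queueUrl(url: string, depth: number, extraHops: number) {
  const score = depth + extraHops;
  // NX: don't re-add (and re-score) a URL that is already queued
  await redis.zadd("crawl:queue", "NX", score, JSON.stringify({ url, depth, extraHops }));
}

async function nextFromQueue(): Promise<{ url: string; depth: number; extraHops: number } | null> {
  const res = await redis.zpopmin("crawl:queue"); // lowest score first
  return res.length ? JSON.parse(res[0]) : null;
}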
|
|
|
} else {
|
2025-11-19 15:49:49 -08:00
|
|
|
// allow failing crawl via script from within behaviors also
|
|
|
|
|
data.contentCheckAllowed = true;
|
|
|
|
|
|
2023-03-17 12:50:32 -07:00
|
|
|
const res = await timedRun(
|
2024-03-27 09:26:51 -07:00
|
|
|
this.runBehaviors(
|
|
|
|
|
page,
|
|
|
|
|
cdp,
|
|
|
|
|
data.filteredFrames,
|
|
|
|
|
opts.frameIdToExecId,
|
|
|
|
|
logDetails,
|
|
|
|
|
),
|
2023-03-22 14:50:18 -04:00
|
|
|
this.params.behaviorTimeout,
|
2023-03-17 12:50:32 -07:00
|
|
|
"Behaviors timed out",
|
|
|
|
|
logDetails,
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2024-06-13 15:42:27 -04:00
|
|
|
true,
|
2023-03-17 12:50:32 -07:00
|
|
|
);
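timedRun(), as called above, races the behavior promise against this.params.behaviorTimeout and logs (rather than throws) when the timeout wins. A minimal sketch with the signature implied by the call site — the real helper lives in util/timing.js and uses the structured logger instead of console:

// Sketch only: race a promise against a timeout given in seconds.
async function timedRun<T>(
  promise: Promise<T>,
  seconds: number,
  message = "Operation timed out",
  logDetails: Record<string, unknown> = {},
  context = "general",
  isWarn = false,
): Promise<T | undefined> {
  let timer: NodeJS.Timeout | undefined;
  const timeout = new Promise<undefined>((resolve) => {
    timer = setTimeout(() => {
      (isWarn ? console.warn : console.error)(message, { seconds, context, ...logDetails });
      resolve(undefined);
    }, seconds * 1000);
  });
  try {
    return await Promise.race([promise, timeout]);
  } finally {
    if (timer) {
      clearTimeout(timer);
    }
  }
}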
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2025-11-19 15:49:49 -08:00
|
|
|
data.contentCheckAllowed = false;
|
|
|
|
|
|
2023-09-14 19:48:41 -07:00
|
|
|
await this.netIdle(page, logDetails);
|
|
|
|
|
|
|
|
|
|
if (res) {
|
2023-03-20 18:31:37 -07:00
|
|
|
data.loadState = LoadState.BEHAVIORS_DONE;
|
2022-03-22 17:41:51 -07:00
|
|
|
}
|
2023-10-31 23:05:30 -07:00
|
|
|
|
|
|
|
|
if (textextract && this.params.text.includes("final-to-warc")) {
|
|
|
|
|
await textextract.extractAndStoreText("textFinal", true, true);
|
|
|
|
|
}
|
2024-11-24 06:26:55 +01:00
|
|
|
|
|
|
|
|
if (
|
|
|
|
|
this.params.screenshot &&
|
|
|
|
|
this.screenshotWriter &&
|
|
|
|
|
this.params.screenshot.includes("fullPageFinal")
|
|
|
|
|
) {
|
2025-01-30 17:39:20 -08:00
|
|
|
await timedRun(
|
|
|
|
|
page.evaluate(() => {
|
|
|
|
|
window.scrollTo(0, 0);
|
|
|
|
|
}),
|
|
|
|
|
PAGE_OP_TIMEOUT_SECS,
|
|
|
|
|
"Page scroll timed out",
|
|
|
|
|
logDetails,
|
|
|
|
|
);
|
|
|
|
|
|
2024-11-24 06:26:55 +01:00
|
|
|
const screenshots = new Screenshots({
|
|
|
|
|
browser: this.browser,
|
|
|
|
|
page,
|
|
|
|
|
url,
|
|
|
|
|
writer: this.screenshotWriter,
|
|
|
|
|
});
|
|
|
|
|
await screenshots.takeFullPageFinal();
|
|
|
|
|
}
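takeFullPageFinal() above captures the page once more after behaviors have run and the page has been scrolled back to the top. One way to take a full-page capture over CDP is Page.captureScreenshot with captureBeyondViewport — a sketch under that assumption, not necessarily how the Screenshots class is implemented:

import { CDPSession } from "puppeteer-core";

// Sketch only: capture the full page, including content below the fold.
async function captureFullPage(cdp: CDPSession): Promise<Buffer> {
  const { data } = await cdp.send("Page.captureScreenshot", {
    format: "png",
    captureBeyondViewport: true,
  });
  return Buffer.from(data, "base64");
}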
|
2021-02-08 22:21:34 -08:00
|
|
|
}
|
2023-03-17 12:50:32 -07:00
|
|
|
}
|
2024-06-26 09:16:24 -07:00
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
async awaitPageExtraDelay(opts: WorkerState) {
|
2023-03-22 14:50:18 -04:00
|
|
|
if (this.params.pageExtraDelay) {
|
2024-06-26 09:16:24 -07:00
|
|
|
const {
|
|
|
|
|
data: { url: page },
|
|
|
|
|
workerid,
|
|
|
|
|
} = opts;
|
|
|
|
|
|
|
|
|
|
const logDetails = { page, workerid };
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info(
|
|
|
|
|
`Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`,
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-03-22 14:50:18 -04:00
|
|
|
await sleep(this.params.pageExtraDelay);
|
|
|
|
|
}
|
2023-03-20 18:31:37 -07:00
|
|
|
}
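pageExtraDelay is given in seconds, so all the method above needs is a seconds-based sleep; a one-line sketch (the real helper lives in util/timing.js):

// Sketch only: seconds-based sleep matching the pageExtraDelay usage above.
const sleep = (seconds: number): Promise<void> =>
  new Promise((resolve) => setTimeout(resolve, seconds * 1000));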
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2025-08-20 16:07:29 -07:00
|
|
|
async pageFinished(data: PageState, lastErrorText = "") {
|
2025-09-10 12:05:21 -07:00
|
|
|
// not yet finished
|
|
|
|
|
if (data.asyncLoading) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
2023-03-20 18:31:37 -07:00
|
|
|
// if page loaded, consider the page finished successfully
|
|
|
|
|
// (even if behaviors timed out)
|
2025-02-06 20:13:20 -08:00
|
|
|
const { loadState, logDetails, depth, url, pageSkipped } = data;
|
2023-03-20 18:31:37 -07:00
|
|
|
|
|
|
|
|
if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
|
2025-01-25 22:55:49 -08:00
|
|
|
await this.writePage(data);
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info("Page Finished", { loadState, ...logDetails }, "pageStatus");
|
2023-03-20 18:31:37 -07:00
|
|
|
|
2024-09-05 13:28:49 -07:00
|
|
|
await this.crawlState.markFinished(url);
|
2023-03-20 18:31:37 -07:00
|
|
|
|
|
|
|
|
if (this.healthChecker) {
|
|
|
|
|
this.healthChecker.resetErrors();
|
|
|
|
|
}
|
2024-09-05 13:28:49 -07:00
|
|
|
|
|
|
|
|
await this.serializeConfig();
|
|
|
|
|
|
|
|
|
|
await this.checkLimits();
|
2023-03-20 18:31:37 -07:00
|
|
|
} else {
|
2025-01-28 11:28:23 -08:00
|
|
|
if (pageSkipped) {
|
|
|
|
|
await this.crawlState.markExcluded(url);
|
|
|
|
|
} else {
|
2025-02-06 18:48:40 -08:00
|
|
|
const retry = await this.crawlState.markFailed(url);
|
2022-03-14 10:41:56 -07:00
|
|
|
|
2025-02-06 20:13:20 -08:00
|
|
|
if (this.healthChecker) {
|
|
|
|
|
this.healthChecker.incError();
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-06 18:48:40 -08:00
|
|
|
if (retry < 0) {
|
2025-02-06 20:13:20 -08:00
|
|
|
await this.writePage(data);
|
2025-02-06 18:48:40 -08:00
|
|
|
|
|
|
|
|
await this.serializeConfig();
|
2024-09-05 13:28:49 -07:00
|
|
|
|
2025-02-06 18:48:40 -08:00
|
|
|
if (depth === 0 && this.params.failOnFailedSeed) {
|
2025-08-20 16:07:29 -07:00
|
|
|
let errorCode = ExitCodes.GenericError;
|
|
|
|
|
|
|
|
|
|
switch (lastErrorText) {
|
|
|
|
|
case "net::ERR_SOCKS_CONNECTION_FAILED":
|
|
|
|
|
case "net::SOCKS_CONNECTION_HOST_UNREACHABLE":
|
|
|
|
|
case "net::ERR_PROXY_CONNECTION_FAILED":
|
|
|
|
|
case "net::ERR_TUNNEL_CONNECTION_FAILED":
|
|
|
|
|
errorCode = ExitCodes.ProxyError;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case "net::ERR_TIMED_OUT":
|
|
|
|
|
case "net::ERR_INVALID_AUTH_CREDENTIALS":
|
|
|
|
|
if (this.proxyServer || this.proxyPacUrl) {
|
|
|
|
|
errorCode = ExitCodes.ProxyError;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
2025-02-06 18:48:40 -08:00
|
|
|
logger.fatal(
|
|
|
|
|
"Seed Page Load Failed, failing crawl",
|
|
|
|
|
{},
|
|
|
|
|
"general",
|
2025-08-20 16:07:29 -07:00
|
|
|
errorCode,
|
2025-02-06 18:48:40 -08:00
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
2024-09-05 13:28:49 -07:00
|
|
|
}
|
2023-03-17 12:50:32 -07:00
|
|
|
|
2024-09-05 13:28:49 -07:00
|
|
|
await this.checkLimits();
|
|
|
|
|
}
|
2023-03-20 18:31:37 -07:00
|
|
|
}
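In the failure branch above, crawlState.markFailed() returns a retry counter, and a negative value means retries are exhausted, at which point the page is written out as permanently failed. A minimal sketch of that pattern with ioredis — key names, the retry limit, and the requeue mechanism are assumptions, not the actual RedisCrawlState logic:

import Redis from "ioredis";

const redis = new Redis("redis://localhost:6379/0");
const MAX_RETRIES = 2; // assumed limit for this sketch

// Sketch only: count failures per URL, requeue while retries remain,
// return -1 once exhausted so the caller records a permanent failure.
async function markFailed(url: string): Promise<number> {
  const failures = await redis.hincrby("crawl:failures", url, 1);
  if (failures <= MAX_RETRIES) {
    await redis.rpush("crawl:retry", url); // give the URL another attempt
    return MAX_RETRIES - failures;
  }
  await redis.sadd("crawl:failed", url);
  return -1;
}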
|
|
|
|
|
|
Autoclick Support (#729)
Adds support for autoclick behavior:
- Adds new `autoclick` behavior option to `--behaviors`, but not enabled by default
- Adds support for new exposed function `__bx_addSet` which allows
autoclick behavior to persist state about links that have already been
clicked to avoid duplicates, only used if link has an href
- Adds a new pageFinished flag on the worker state.
- Adds an on('dialog') handler to reject onbeforeunload page navigations,
when in behavior (page not finished), but accept when page is finished -
to allow navigation away only when behaviors are done
- Update to browsertrix-behaviors 0.7.0, which supports autoclick
- Add --clickSelector option to customize elements that will be clicked,
defaulting to `a`.
- Add --linkSelector as alias for --selectLinks for consistency
- Unknown options for --behaviors printed as warnings, instead of hard
exit, for forward compatibility for new behavior types in the future
Fixes #728, also #216, #665, #31
2025-01-16 09:38:11 -08:00
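The `__bx_addSet` hook described above only has to answer "has this href been clicked before?". A minimal sketch of exposing such a function to the page — in-memory here for brevity, while the crawler persists the set (e.g. in Redis), and the exact signature is an assumption:

import { Page } from "puppeteer-core";

// Sketch only: dedup check so autoclick skips links it already clicked.
const clickedLinks = new Set<string>();

async function exposeAddSet(page: Page) {
  await page.exposeFunction("__bx_addSet", (href: string): boolean => {
    if (clickedLinks.has(href)) {
      return false; // seen before: behavior should skip this link
    }
    clickedLinks.add(href);
    return true; // first time: behavior may click it
  });
}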
|
|
|
async teardownPage({ workerid }: WorkerState) {
|
2023-03-20 18:31:37 -07:00
|
|
|
if (this.screencaster) {
|
|
|
|
|
await this.screencaster.stopById(workerid);
|
2021-02-08 22:21:34 -08:00
|
|
|
}
|
2023-03-20 18:31:37 -07:00
|
|
|
}
|
2023-03-13 14:48:04 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async workerIdle(workerid: WorkerId) {
|
2023-03-20 18:31:37 -07:00
|
|
|
if (this.screencaster) {
|
|
|
|
|
//logger.debug("End Screencast", {workerid}, "screencast");
|
|
|
|
|
await this.screencaster.stopById(workerid, true);
|
|
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async runBehaviors(
|
|
|
|
|
page: Page,
|
|
|
|
|
cdp: CDPSession,
|
|
|
|
|
frames: Frame[],
|
2024-03-27 09:26:51 -07:00
|
|
|
frameIdToExecId: Map<string, number>,
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails: LogDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
2023-03-08 21:31:19 -05:00
|
|
|
try {
|
2023-03-17 12:50:32 -07:00
|
|
|
frames = frames || page.frames();
|
|
|
|
|
|
2025-04-09 12:24:29 +02:00
|
|
|
logger.debug(
|
2023-11-09 11:27:11 -08:00
|
|
|
"Running behaviors",
|
|
|
|
|
{
|
|
|
|
|
frames: frames.length,
|
|
|
|
|
frameUrls: frames.map((frame) => frame.url()),
|
|
|
|
|
...logDetails,
|
|
|
|
|
},
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-09-14 19:48:41 -07:00
|
|
|
const results = await Promise.allSettled(
|
2023-11-09 11:27:11 -08:00
|
|
|
frames.map((frame) =>
|
|
|
|
|
this.browser.evaluateWithCLI(
|
|
|
|
|
cdp,
|
2024-03-27 09:26:51 -07:00
|
|
|
frame,
|
|
|
|
|
frameIdToExecId,
|
2023-11-09 11:27:11 -08:00
|
|
|
`
|
2023-09-20 14:02:37 -05:00
|
|
|
if (!self.__bx_behaviors) {
|
|
|
|
|
console.error("__bx_behaviors missing, can't run behaviors");
|
|
|
|
|
} else {
|
|
|
|
|
self.__bx_behaviors.run();
|
2023-11-09 11:27:11 -08:00
|
|
|
}`,
|
|
|
|
|
logDetails,
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
|
|
|
|
),
|
|
|
|
|
),
|
2023-03-08 21:31:19 -05:00
|
|
|
);
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
for (const res of results) {
|
2024-03-27 09:26:51 -07:00
|
|
|
const { status, reason }: { status: string; reason?: unknown } = res;
|
2023-09-14 19:48:41 -07:00
|
|
|
if (status === "rejected") {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.warn(
|
|
|
|
|
"Behavior run partially failed",
|
2024-03-27 09:26:51 -07:00
|
|
|
{ reason: formatErr(reason), ...logDetails },
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-09-14 19:48:41 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-04-09 12:24:29 +02:00
|
|
|
logger.debug(
|
2023-11-09 11:27:11 -08:00
|
|
|
"Behaviors finished",
|
|
|
|
|
{ finished: results.length, ...logDetails },
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-09-14 19:48:41 -07:00
|
|
|
return true;
|
2023-03-08 21:31:19 -05:00
|
|
|
} catch (e) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.warn(
|
|
|
|
|
"Behavior run failed",
|
2023-11-14 21:54:40 -08:00
|
|
|
{ ...formatErr(e), ...logDetails },
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-09-14 19:48:41 -07:00
|
|
|
return false;
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async shouldIncludeFrame(frame: Frame, logDetails: LogDetails) {
|
2023-01-23 16:47:33 -08:00
|
|
|
if (!frame.parentFrame()) {
|
2023-03-20 18:31:37 -07:00
|
|
|
return frame;
|
2023-01-23 16:47:33 -08:00
|
|
|
}
|
|
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
const frameUrl = frame.url();
|
2023-01-23 16:47:33 -08:00
|
|
|
|
2024-05-09 11:05:33 +02:00
|
|
|
if (!frameUrl) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2023-09-14 19:48:41 -07:00
|
|
|
// this is all designed to detect and skip PDFs, and other frames that are actually EMBEDs
|
|
|
|
|
// if there's no tag or an iframe tag, then assume it's a regular frame
|
2024-05-09 11:05:33 +02:00
|
|
|
let tagName = "";
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
tagName = await timedRun(
|
|
|
|
|
frame.evaluate(
|
|
|
|
|
"self && self.frameElement && self.frameElement.tagName",
|
|
|
|
|
),
|
|
|
|
|
PAGE_OP_TIMEOUT_SECS,
|
|
|
|
|
"Frame check timed out",
|
|
|
|
|
logDetails,
|
|
|
|
|
);
|
|
|
|
|
} catch (e) {
|
|
|
|
|
// ignore
|
|
|
|
|
}
|
2023-03-20 18:31:37 -07:00
|
|
|
|
2023-09-14 19:48:41 -07:00
|
|
|
if (tagName && tagName !== "IFRAME" && tagName !== "FRAME") {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
"Skipping processing non-frame object",
|
|
|
|
|
{ tagName, frameUrl, ...logDetails },
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-03-20 18:31:37 -07:00
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
let res;
|
|
|
|
|
|
|
|
|
|
if (frameUrl === "about:blank") {
|
|
|
|
|
res = false;
|
|
|
|
|
} else {
|
2023-11-09 11:27:11 -08:00
|
|
|
res = this.adBlockRules && !this.adBlockRules.isAdUrl(frameUrl);
|
2023-02-23 18:50:22 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!res) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
"Skipping processing frame",
|
|
|
|
|
{ frameUrl, ...logDetails },
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-01-23 16:47:33 -08:00
|
|
|
}
|
|
|
|
|
|
2023-03-20 18:31:37 -07:00
|
|
|
return res ? frame : null;
|
2023-01-23 16:47:33 -08:00
|
|
|
}
|
|
|
|
|
|
2024-09-10 08:28:07 -07:00
|
|
|
async updateCurrSize(): Promise<number> {
|
|
|
|
|
if (this.params.dryRun) {
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
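The health check described above is just an HTTP endpoint that flips between 200 and 503 based on consecutive page-load errors. A minimal sketch — the real HealthChecker in util/healthcheck.js is what the incError()/resetErrors() calls elsewhere in this file refer to, and its threshold handling may differ:

import http from "http";

// Sketch only: return 200 while consecutive errors stay below the threshold
// (e.g. 2 * number of workers), 503 otherwise.
class SimpleHealthChecker {
  errorCount = 0;

  constructor(port: number, private threshold: number) {
    http
      .createServer((_req, res) => {
        const healthy = this.errorCount < this.threshold;
        res.writeHead(healthy ? 200 : 503);
        res.end(healthy ? "OK" : "unhealthy");
      })
      .listen(port);
  }

  incError() {
    this.errorCount += 1;
  }

  resetErrors() {
    this.errorCount = 0;
  }
}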
|
|
|
|
2024-09-10 08:28:07 -07:00
|
|
|
const size = await getDirSize(this.archivesDir);
|
2023-04-19 21:10:02 -04:00
|
|
|
|
|
|
|
|
await this.crawlState.setArchiveSize(size);
|
2022-05-18 22:51:55 -07:00
|
|
|
|
2024-09-10 08:28:07 -07:00
|
|
|
return size;
|
|
|
|
|
}
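getDirSize() above only needs a recursive stat walk over the archives directory; a minimal sketch (the real helper is in util/storage.js):

import fsp from "fs/promises";
import path from "path";

// Sketch only: sum file sizes under a directory, recursing into subdirectories.
async function getDirSize(dir: string): Promise<number> {
  let total = 0;
  for (const entry of await fsp.readdir(dir, { withFileTypes: true })) {
    const full = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      total += await getDirSize(full);
    } else if (entry.isFile()) {
      total += (await fsp.stat(full)).size;
    }
  }
  return total;
}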
|
|
|
|
|
|
|
|
|
|
async checkLimits() {
|
2025-02-10 23:00:55 +01:00
|
|
|
let interrupt: InterruptReason | null = null;
|
2024-09-10 08:28:07 -07:00
|
|
|
|
|
|
|
|
const size = await this.updateCurrSize();
|
|
|
|
|
|
2023-03-31 12:35:18 -04:00
|
|
|
if (this.params.sizeLimit) {
|
2022-05-18 22:51:55 -07:00
|
|
|
if (size >= this.params.sizeLimit) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info(
|
2023-11-09 19:11:11 -05:00
|
|
|
`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2025-02-10 23:00:55 +01:00
|
|
|
interrupt = InterruptReason.SizeLimit;
|
2022-05-18 22:51:55 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.params.timeLimit) {
|
2023-03-22 14:50:18 -04:00
|
|
|
const elapsed = secondsElapsed(this.startTime);
|
|
|
|
|
if (elapsed >= this.params.timeLimit) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info(
|
2023-11-09 19:11:11 -05:00
|
|
|
`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2025-02-10 23:00:55 +01:00
|
|
|
interrupt = InterruptReason.TimeLimit;
|
2022-05-18 22:51:55 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-31 12:35:18 -04:00
|
|
|
if (this.params.diskUtilization) {
|
2023-07-06 00:58:28 -04:00
|
|
|
// Check that disk usage isn't already or soon to be above threshold
|
2024-06-07 19:13:15 +02:00
|
|
|
const diskUtil = await checkDiskUtilization(
|
|
|
|
|
this.collDir,
|
|
|
|
|
this.params,
|
|
|
|
|
size,
|
|
|
|
|
);
|
2023-07-06 00:58:28 -04:00
|
|
|
if (diskUtil.stop === true) {
|
2025-02-10 23:00:55 +01:00
|
|
|
interrupt = InterruptReason.DiskUtilization;
|
2023-03-31 12:35:18 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-03 20:21:30 -07:00
|
|
|
if (this.params.failOnFailedLimit) {
|
2025-02-06 18:48:40 -08:00
|
|
|
const numFailed = await this.crawlState.numFailed();
|
2024-05-21 19:35:43 -04:00
|
|
|
const failedLimit = this.params.failOnFailedLimit;
|
|
|
|
|
if (numFailed >= failedLimit) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.fatal(
|
2024-05-21 19:35:43 -04:00
|
|
|
`Failed threshold reached ${numFailed} >= ${failedLimit}, failing crawl`,
|
2025-02-10 23:00:55 +01:00
|
|
|
{},
|
|
|
|
|
"general",
|
|
|
|
|
ExitCodes.FailedLimit,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-10-03 20:21:30 -07:00
|
|
|
}
|
|
|
|
|
}
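// Hypothetical illustration only: if failed page URLs were kept in a Redis
// list keyed per crawl, a numFailed()-style counter could simply be the list
// length. The key name below is made up for the example and is not the
// crawler's actual state layout.
import Redis from "ioredis";

async function countFailedSketch(redis: Redis, crawlId: string): Promise<number> {
  return redis.llen(`${crawlId}:failed`);
}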
|
|
|
|
|
|
2025-05-05 10:10:08 -07:00
|
|
|
if (await this.crawlState.isCrawlPaused()) {
|
|
|
|
|
interrupt = InterruptReason.CrawlPaused;
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-18 22:51:55 -07:00
|
|
|
if (interrupt) {
|
2023-08-15 11:34:39 -07:00
|
|
|
this.uploadAndDeleteLocal = true;
|
2025-02-10 23:00:55 +01:00
|
|
|
this.gracefulFinishOnInterrupt(interrupt);
|
2022-05-18 22:51:55 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-10 23:00:55 +01:00
|
|
|
gracefulFinishOnInterrupt(interruptReason: InterruptReason) {
|
|
|
|
|
this.interruptReason = interruptReason;
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Crawler interrupted, gracefully finishing current pages");
|
2023-09-13 10:48:21 -07:00
|
|
|
if (!this.params.waitOnDone && !this.params.restartsOnError) {
|
2022-09-20 17:09:52 -07:00
|
|
|
this.finalExit = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-09 12:28:58 -07:00
|
|
|
async checkCanceled() {
|
2023-11-09 11:27:11 -08:00
|
|
|
if (this.crawlState && (await this.crawlState.isCrawlCanceled())) {
|
2025-02-06 17:54:51 -08:00
|
|
|
await this.setStatusAndExit(ExitCodes.Success, "canceled");
|
2023-10-09 12:28:58 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-06 17:54:51 -08:00
|
|
|
async setStatusAndExit(exitCode: ExitCodes, status: string) {
|
2025-11-25 07:58:30 -08:00
|
|
|
await logger.interrupt("", {}, exitCode, status);
|
2023-10-02 20:55:52 -04:00
|
|
|
}
|
|
|
|
|
|
2022-09-20 17:09:52 -07:00
|
|
|
async serializeAndExit() {
|
|
|
|
|
await this.serializeConfig();
|
2023-10-02 20:55:52 -04:00
|
|
|
|
2025-02-10 23:00:55 +01:00
|
|
|
if (this.interruptReason) {
|
2024-03-21 13:56:05 -07:00
|
|
|
await closeWorkers(0);
|
2025-02-10 23:00:55 +01:00
|
|
|
await this.browser.close();
|
2024-03-26 14:54:27 -07:00
|
|
|
await this.closeFiles();
|
2025-02-10 23:00:55 +01:00
|
|
|
|
2024-07-23 18:50:26 -07:00
|
|
|
if (!this.done) {
|
2025-02-06 17:54:51 -08:00
|
|
|
await this.setStatusAndExit(
|
2025-02-10 23:00:55 +01:00
|
|
|
ExitCodes.SignalInterruptedForce,
|
2025-02-06 17:54:51 -08:00
|
|
|
"interrupted",
|
|
|
|
|
);
|
2024-07-23 18:50:26 -07:00
|
|
|
return;
|
|
|
|
|
}
|
2023-10-03 20:21:30 -07:00
|
|
|
}
|
2025-02-10 23:00:55 +01:00
|
|
|
|
2025-02-06 17:54:51 -08:00
|
|
|
await this.setStatusAndExit(ExitCodes.Success, "done");
|
2022-09-20 17:09:52 -07:00
|
|
|
}
|
|
|
|
|
|
2023-08-22 09:16:00 -07:00
|
|
|
async isCrawlRunning() {
|
2025-02-10 23:00:55 +01:00
|
|
|
if (this.interruptReason) {
|
2023-08-22 09:16:00 -07:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-09 12:28:58 -07:00
|
|
|
if (await this.crawlState.isCrawlCanceled()) {
|
2025-02-06 17:54:51 -08:00
|
|
|
await this.setStatusAndExit(ExitCodes.Success, "canceled");
|
2023-10-09 12:28:58 -07:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-22 09:16:00 -07:00
|
|
|
if (await this.crawlState.isCrawlStopped()) {
|
2023-09-13 10:48:21 -07:00
|
|
|
logger.info("Crawler is stopped");
|
2023-08-22 09:16:00 -07:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
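// Schematic usage only: a worker-style loop can poll isCrawlRunning() and stop
// pulling new work once an interrupt, cancel, or stop is observed. The real
// loop lives in the worker code; this is just an illustration of the contract.
async function pollUntilStoppedSketch(crawler: {
  isCrawlRunning: () => Promise<boolean>;
}) {
  while (await crawler.isCrawlRunning()) {
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }
}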
|
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async crawl() {
|
2022-05-18 22:51:55 -07:00
|
|
|
if (this.params.healthCheckPort) {
|
2023-11-09 11:27:11 -08:00
|
|
|
this.healthChecker = new HealthChecker(
|
|
|
|
|
this.params.healthCheckPort,
|
2023-11-09 19:11:11 -05:00
|
|
|
this.params.workers,
|
2025-02-10 23:00:55 +01:00
|
|
|
this.browser,
|
2024-09-10 08:28:07 -07:00
|
|
|
async () => {
|
|
|
|
|
await this.updateCurrSize();
|
|
|
|
|
},
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2022-05-18 22:51:55 -07:00
|
|
|
}
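// The health check described in the commit notes above returns 200 while
// consecutive failed page loads stay below twice the worker count, and 503
// otherwise. A minimal sketch honoring that contract (the real HealthChecker
// in util/healthcheck.js also receives the browser and a size-update callback;
// names here are illustrative):
import http from "node:http";

function startHealthServerSketch(
  port: number,
  workers: number,
  getErrorCount: () => number,
) {
  const server = http.createServer((req, res) => {
    const healthy = getErrorCount() < 2 * workers;
    res.writeHead(healthy ? 200 : 503, { "Content-Type": "text/plain" });
    res.end(healthy ? "OK" : "Unhealthy");
  });
  server.listen(port);
  return server;
}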
|
|
|
|
|
|
2024-11-08 08:04:41 -08:00
|
|
|
if (this.params.driver) {
|
|
|
|
|
try {
|
|
|
|
|
const driverUrl = new URL(this.params.driver, import.meta.url);
|
|
|
|
|
this.driver = (await import(driverUrl.href)).default;
|
|
|
|
|
} catch (e) {
|
|
|
|
|
logger.warn(`Error importing driver ${this.params.driver}`, e);
|
|
|
|
|
return;
|
|
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
}
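// Hypothetical custom driver module (e.g. saved as ./myDriver.mjs and passed
// via --driver). The parameter shape and the crawler.loadPage() call below are
// assumptions for illustration of what a driver's default export might look
// like, not a documented signature.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export default async ({ data, page, crawler }: any) => {
  await crawler.loadPage(page, data);
};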
|
|
|
|
|
|
2022-06-30 19:24:26 -07:00
|
|
|
let initState = await this.crawlState.getStatus();
|
|
|
|
|
|
|
|
|
|
while (initState === "debug") {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Paused for debugging, will continue after manual resume");
|
2022-06-30 19:24:26 -07:00
|
|
|
|
2023-03-13 14:48:04 -07:00
|
|
|
await sleep(60);
|
2022-06-30 19:24:26 -07:00
|
|
|
|
|
|
|
|
initState = await this.crawlState.getStatus();
|
|
|
|
|
}
|
|
|
|
|
|
2022-09-08 23:39:26 -07:00
|
|
|
// if already done, don't crawl anymore
|
|
|
|
|
if (initState === "done") {
|
|
|
|
|
this.done = true;
|
|
|
|
|
|
|
|
|
|
if (this.params.waitOnDone) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Already done, waiting for signal to exit...");
|
2022-09-08 23:39:26 -07:00
|
|
|
|
|
|
|
|
// wait forever until signal
|
|
|
|
|
await new Promise(() => {});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
}
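// `await new Promise(() => {})` above never resolves, so the process idles
// until a signal handler registered elsewhere exits it. A minimal standalone
// sketch of that pattern (the crawler's real handlers also save state and
// finalize outputs before exiting):
function waitForSignalSketch(): Promise<never> {
  process.once("SIGTERM", () => process.exit(0));
  process.once("SIGINT", () => process.exit(0));
  return new Promise<never>(() => {});
}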
|
|
|
|
|
|
2025-10-29 19:57:25 -07:00
|
|
|
if (this.params.generateWACZ || this.params.saveProfile) {
|
2022-05-18 22:51:55 -07:00
|
|
|
this.storage = initStorage();
|
2025-10-29 19:57:25 -07:00
|
|
|
}
|
2025-01-29 18:15:28 -08:00
|
|
|
|
2025-10-29 19:57:25 -07:00
|
|
|
if (this.params.generateWACZ && this.storage) {
|
|
|
|
|
await this.crawlState.setWACZFilename();
|
2021-11-23 12:53:30 -08:00
|
|
|
}
|
|
|
|
|
|
2023-05-03 16:25:59 -07:00
|
|
|
if (POST_CRAWL_STATES.includes(initState)) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info("crawl already finished, running post-crawl tasks", {
|
|
|
|
|
state: initState,
|
|
|
|
|
});
|
2025-01-25 22:55:49 -08:00
|
|
|
this.finalExit = true;
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.postCrawl();
|
|
|
|
|
return;
|
|
|
|
|
} else if (await this.crawlState.isCrawlStopped()) {
|
|
|
|
|
logger.info("crawl stopped, running post-crawl tasks");
|
2023-09-13 10:48:21 -07:00
|
|
|
this.finalExit = true;
|
2023-03-08 21:31:19 -05:00
|
|
|
await this.postCrawl();
|
|
|
|
|
return;
|
2023-10-09 12:28:58 -07:00
|
|
|
} else if (await this.crawlState.isCrawlCanceled()) {
|
|
|
|
|
logger.info("crawl canceled, will exit");
|
|
|
|
|
return;
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
2022-06-30 19:24:26 -07:00
|
|
|
|
2024-09-10 08:28:07 -07:00
|
|
|
await this.checkLimits();
|
|
|
|
|
|
2022-06-30 19:24:26 -07:00
|
|
|
await this.crawlState.setStatus("running");
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2024-04-11 13:55:52 -07:00
|
|
|
this.pagesFH = await this.initPages(this.seedPagesFile, "Seed Pages");
|
|
|
|
|
this.extraPagesFH = await this.initPages(
|
|
|
|
|
this.otherPagesFile,
|
|
|
|
|
"Non-Seed Pages",
|
|
|
|
|
);
|
2021-06-07 17:43:36 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
this.adBlockRules = new AdBlockRules(
|
|
|
|
|
this.captureBasePrefix,
|
2023-11-09 19:11:11 -05:00
|
|
|
this.params.adBlockMessage,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2022-10-25 10:53:32 -04:00
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
if (this.params.blockRules && this.params.blockRules.length) {
|
2023-11-09 11:27:11 -08:00
|
|
|
this.blockRules = new BlockRules(
|
2024-09-05 18:10:27 -07:00
|
|
|
this.params.blockRules as BlockRuleDecl[],
|
2023-11-09 11:27:11 -08:00
|
|
|
this.captureBasePrefix,
|
2023-11-09 19:11:11 -05:00
|
|
|
this.params.blockMessage,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2021-07-19 15:49:43 -07:00
|
|
|
}
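// Hypothetical illustration of block rule declarations as they might appear in
// a crawl config before being passed to BlockRules above. The field names are
// assumptions for the example, not a schema reference.
const exampleBlockRules = [
  { url: "https?://ads\\.example\\.com/.*", type: "block" },
  {
    url: "example\\.com/media/",
    inFrameUrl: "example\\.com/page/",
    type: "allowOnly",
  },
];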
|
|
|
|
|
|
2022-02-23 12:09:48 -08:00
|
|
|
this.screencaster = this.initScreenCaster();
|
2021-06-07 17:43:36 -07:00
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
if (this.params.originOverride && this.params.originOverride.length) {
|
2024-09-05 18:10:27 -07:00
|
|
|
this.originOverride = new OriginOverride(
|
|
|
|
|
this.params.originOverride as string[],
|
|
|
|
|
);
|
2023-04-19 19:17:15 -07:00
|
|
|
}
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
await this._addInitialSeeds();
|
2020-11-14 21:55:02 +00:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer needed without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: additional logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
await this.browser.launch({
|
2023-04-24 10:26:56 -07:00
|
|
|
profileUrl: this.params.profile,
|
2023-03-17 12:50:32 -07:00
|
|
|
headless: this.params.headless,
|
|
|
|
|
emulateDevice: this.emulateDevice,
|
2024-03-22 13:37:14 -07:00
|
|
|
swOpt: this.params.serviceWorker,
|
2023-03-17 12:50:32 -07:00
|
|
|
chromeOptions: {
|
2025-08-20 16:07:29 -07:00
|
|
|
proxyServer: this.proxyServer,
|
|
|
|
|
proxyPacUrl: this.proxyPacUrl,
|
2023-03-17 12:50:32 -07:00
|
|
|
userAgent: this.emulateDevice.userAgent,
|
2023-11-09 11:27:11 -08:00
|
|
|
extraArgs: this.extraChromeArgs(),
|
2023-09-18 15:24:33 -07:00
|
|
|
},
|
2023-11-09 11:27:11 -08:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
ondisconnect: (err: any) => {
|
2025-02-10 10:16:25 -08:00
|
|
|
this.markBrowserCrashed();
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.error(
|
|
|
|
|
"Browser disconnected (crashed?), interrupting crawl",
|
|
|
|
|
err,
|
2023-11-09 19:11:11 -05:00
|
|
|
"browser",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
|
|
|
|
},
|
2024-03-22 17:32:42 -07:00
|
|
|
|
|
|
|
|
recording: this.recording,
|
2023-11-09 19:11:11 -05:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
2023-11-09 11:27:11 -08:00
|
|
|
} as any);
|
2023-11-07 21:38:50 -08:00
|
|
|
|
2023-03-22 14:50:18 -04:00
|
|
|
// --------------
|
|
|
|
|
// Run Crawl Here!
|
2025-10-29 19:57:25 -07:00
|
|
|
await runWorkers(
|
|
|
|
|
this,
|
|
|
|
|
this.params.workers,
|
|
|
|
|
this.maxPageTime,
|
|
|
|
|
false,
|
|
|
|
|
!!this.params.saveProfile,
|
|
|
|
|
);
|
2023-03-22 14:50:18 -04:00
|
|
|
// --------------
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2025-10-29 19:57:25 -07:00
|
|
|
await this.browser.close();
|
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
await this.serializeConfig(true);
|
2021-09-28 09:41:16 -07:00
|
|
|
|
2024-04-11 13:55:52 -07:00
|
|
|
await this.closePages();
|
2021-04-29 14:34:56 -07:00
|
|
|
|
2024-03-26 14:54:27 -07:00
|
|
|
await this.closeFiles();
|
|
|
|
|
|
2023-09-15 00:16:19 +02:00
|
|
|
await this.writeStats();
|
2023-02-23 18:50:22 -08:00
|
|
|
|
2025-01-25 22:55:49 -08:00
|
|
|
// if crawl has been stopped or finished, mark as final exit for post-crawl tasks
|
|
|
|
|
if (
|
|
|
|
|
(await this.crawlState.isCrawlStopped()) ||
|
|
|
|
|
(await this.crawlState.isFinished())
|
|
|
|
|
) {
|
2023-09-13 10:48:21 -07:00
|
|
|
this.finalExit = true;
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
await this.postCrawl();
|
|
|
|
|
}
|
|
|
|
|
|
2024-04-11 13:55:52 -07:00
|
|
|
async closePages() {
|
|
|
|
|
if (this.pagesFH) {
|
|
|
|
|
try {
|
|
|
|
|
await new Promise<void>((resolve) =>
|
|
|
|
|
this.pagesFH!.close(() => resolve()),
|
|
|
|
|
);
|
|
|
|
|
} catch (e) {
|
|
|
|
|
// ignore
|
|
|
|
|
} finally {
|
|
|
|
|
this.pagesFH = null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.extraPagesFH) {
|
|
|
|
|
try {
|
|
|
|
|
await new Promise<void>((resolve) =>
|
|
|
|
|
this.extraPagesFH!.close(() => resolve()),
|
|
|
|
|
);
|
|
|
|
|
} catch (e) {
|
|
|
|
|
// ignore
|
|
|
|
|
} finally {
|
|
|
|
|
this.extraPagesFH = null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-26 14:54:27 -07:00
|
|
|
async closeFiles() {
|
|
|
|
|
if (this.textWriter) {
|
|
|
|
|
await this.textWriter.flush();
|
|
|
|
|
}
|
|
|
|
|
if (this.screenshotWriter) {
|
|
|
|
|
await this.screenshotWriter.flush();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
protected async _addInitialSeeds() {
|
2024-06-13 17:18:06 -07:00
|
|
|
for (let i = 0; i < this.seeds.length; i++) {
|
|
|
|
|
const seed = this.seeds[i];
|
2024-03-22 17:32:42 -07:00
|
|
|
if (!(await this.queueUrl(i, seed.url, 0, 0))) {
|
|
|
|
|
if (this.limitHit) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (seed.sitemap) {
|
|
|
|
|
await timedRun(
|
|
|
|
|
this.parseSitemap(seed, i),
|
|
|
|
|
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
|
|
|
|
|
"Sitemap initial fetch timed out",
|
|
|
|
|
{ sitemap: seed.sitemap, seed: seed.url },
|
|
|
|
|
"sitemap",
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
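// timedRun above races the sitemap fetch against a timeout. A hedged sketch of
// such a helper: race a promise against a timer and log if the timer wins.
// Illustrative only; the crawler's own helper also carries log details and a
// category.
async function timedRunSketch<T>(
  promise: Promise<T>,
  timeoutSecs: number,
  message: string,
): Promise<T | undefined> {
  let timer: NodeJS.Timeout | undefined;
  const timeout = new Promise<undefined>((resolve) => {
    timer = setTimeout(() => {
      console.warn(message);
      resolve(undefined);
    }, timeoutSecs * 1000);
  });
  try {
    return await Promise.race([promise, timeout]);
  } finally {
    if (timer) {
      clearTimeout(timer);
    }
  }
}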
|
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
async postCrawl() {
|
2024-10-31 14:06:17 -07:00
|
|
|
this.postCrawling = true;
|
|
|
|
|
logger.info("Crawling done");
|
|
|
|
|
|
2024-06-07 10:34:19 -07:00
|
|
|
if (this.params.combineWARC && !this.params.dryRun) {
|
2021-03-31 13:41:27 -04:00
|
|
|
await this.combineWARC();
|
|
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2025-02-28 17:58:56 -08:00
|
|
|
const generateFiles =
|
|
|
|
|
!this.params.dryRun &&
|
|
|
|
|
(!this.interruptReason || this.finalExit || this.uploadAndDeleteLocal);
|
|
|
|
|
|
2024-08-29 13:21:20 -07:00
|
|
|
if (
|
|
|
|
|
(this.params.generateCDX || this.params.generateWACZ) &&
|
2025-02-28 17:58:56 -08:00
|
|
|
generateFiles
|
2024-08-29 13:21:20 -07:00
|
|
|
) {
|
|
|
|
|
logger.info("Merging CDX");
|
|
|
|
|
await this.crawlState.setStatus(
|
|
|
|
|
this.params.generateWACZ ? "generate-wacz" : "generate-cdx",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-11-07 21:38:50 -08:00
|
|
|
|
2024-08-29 13:21:20 -07:00
|
|
|
await mergeCDXJ(
|
|
|
|
|
this.warcCdxDir,
|
|
|
|
|
this.indexesDir,
|
|
|
|
|
this.params.generateWACZ ? null : false,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
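// mergeCDXJ above combines the per-WARC CDXJ files from warcCdxDir into the
// indexes directory. A hedged sketch of merging already-sorted CDXJ with the
// system `sort` utility; the output name, flags, and environment are
// illustrative assumptions, not the crawler's implementation.
import { spawn } from "node:child_process";
import { createWriteStream } from "node:fs";
import { join } from "node:path";

function mergeCdxjSketch(cdxjFiles: string[], outDir: string): Promise<void> {
  const out = createWriteStream(join(outDir, "index.cdxj"));
  return new Promise((resolve, reject) => {
    const proc = spawn("sort", cdxjFiles, {
      env: { ...process.env, LC_ALL: "C" },
    });
    proc.stdout.pipe(out);
    proc.on("error", reject);
    proc.on("close", (code) =>
      code === 0 ? resolve() : reject(new Error(`sort exited with code ${code}`)),
    );
  });
}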
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2025-02-28 17:58:56 -08:00
|
|
|
if (this.params.generateWACZ && generateFiles) {
|
2023-08-01 00:04:10 -07:00
|
|
|
const uploaded = await this.generateWACZ();
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit being exceeded, but not due to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
|
2023-08-01 00:04:10 -07:00
|
|
|
if (uploaded && this.uploadAndDeleteLocal) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info(
|
2023-11-09 19:11:11 -05:00
|
|
|
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2022-06-17 11:58:44 -07:00
|
|
|
try {
|
|
|
|
|
fs.rmSync(this.collDir, { recursive: true, force: true });
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn(`Unable to clear ${this.collDir} before exit`, e);
|
2022-06-17 11:58:44 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-10-29 19:57:25 -07:00
|
|
|
if (this.finalExit && generateFiles && this.params.saveProfile) {
|
|
|
|
|
const resource = await this.browser.saveProfile(
|
|
|
|
|
this.params.saveProfile,
|
|
|
|
|
this.storage,
|
|
|
|
|
this.params.saveProfile,
|
|
|
|
|
);
|
|
|
|
|
if (resource && resource.path) {
|
|
|
|
|
await this.crawlState.markProfileUploaded(resource);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-10 23:00:55 +01:00
|
|
|
if (this.params.waitOnDone && (!this.interruptReason || this.finalExit)) {
|
2022-06-17 11:58:44 -07:00
|
|
|
this.done = true;
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("All done, waiting for signal...");
|
2022-06-17 11:58:44 -07:00
|
|
|
await this.crawlState.setStatus("done");
|
|
|
|
|
|
|
|
|
|
// wait forever until signal
|
|
|
|
|
await new Promise(() => {});
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
|
|
|
|
}
|
2022-01-26 16:06:10 -08:00
|
|
|
|
2025-02-10 10:16:25 -08:00
|
|
|
markBrowserCrashed() {
|
2025-02-10 23:00:55 +01:00
|
|
|
this.interruptReason = InterruptReason.BrowserCrashed;
|
|
|
|
|
this.browser.crashed = true;
|
2025-02-10 10:16:25 -08:00
|
|
|
}
|
|
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
async generateWACZ() {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Generating WACZ");
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.crawlState.setStatus("generate-wacz");
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
// Get a list of the warcs inside
|
2024-03-22 17:32:42 -07:00
|
|
|
const warcFileList = await fsp.readdir(this.archivesDir);
|
2022-02-08 15:31:55 -08:00
|
|
|
|
2022-06-17 11:58:44 -07:00
|
|
|
// is finished (>0 pages and all pages written)
|
|
|
|
|
const isFinished = await this.crawlState.isFinished();
|
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Num WARC Files: ${warcFileList.length}`);
|
2022-02-08 15:31:55 -08:00
|
|
|
if (!warcFileList.length) {
|
2022-06-17 11:58:44 -07:00
|
|
|
// if finished, just return
|
2023-11-09 11:27:11 -08:00
|
|
|
if (isFinished || (await this.crawlState.isCrawlCanceled())) {
|
2022-06-17 11:58:44 -07:00
|
|
|
return;
|
|
|
|
|
}
|
2025-05-05 10:10:08 -07:00
|
|
|
// possibly restarted after committing, so assume done here!
|
|
|
|
|
if ((await this.crawlState.numDone()) > 0) {
|
|
|
|
|
return;
|
2023-05-19 07:38:16 -07:00
|
|
|
}
|
2023-10-03 20:21:30 -07:00
|
|
|
// fail crawl otherwise
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.fatal("No WARC Files, assuming crawl failed");
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
|
|
|
|
|
2024-08-29 13:21:20 -07:00
|
|
|
const waczPath = path.join(this.collDir, this.params.collection + ".wacz");
|
2023-09-13 10:04:09 -07:00
|
|
|
|
2024-08-29 13:21:20 -07:00
|
|
|
const streaming = !!this.storage;
|
2022-02-08 15:31:55 -08:00
|
|
|
|
2024-08-29 13:21:20 -07:00
|
|
|
if (!streaming) {
|
|
|
|
|
logger.debug("WACZ will be written to disk", { path: waczPath }, "wacz");
|
|
|
|
|
} else {
|
|
|
|
|
logger.debug("WACZ will be stream uploaded to remote storage");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
logger.debug("End of log file in WACZ, storing logs to WACZ file");
|
|
|
|
|
|
2025-11-25 07:58:30 -08:00
|
|
|
await logger.closeLog();
|
2024-08-29 13:21:20 -07:00
|
|
|
|
|
|
|
|
const waczOpts: WACZInitOpts = {
|
|
|
|
|
input: warcFileList.map((x) => path.join(this.archivesDir, x)),
|
|
|
|
|
output: waczPath,
|
|
|
|
|
pages: this.pagesDir,
|
|
|
|
|
logDirectory: this.logDir,
|
|
|
|
|
warcCdxDir: this.warcCdxDir,
|
|
|
|
|
indexesDir: this.indexesDir,
|
|
|
|
|
softwareString: this.infoString,
|
|
|
|
|
};
|
2022-02-08 15:31:55 -08:00
|
|
|
|
|
|
|
|
if (process.env.WACZ_SIGN_URL) {
|
2024-08-29 13:21:20 -07:00
|
|
|
waczOpts.signingUrl = process.env.WACZ_SIGN_URL;
|
2022-02-08 15:31:55 -08:00
|
|
|
if (process.env.WACZ_SIGN_TOKEN) {
|
2024-08-29 13:21:20 -07:00
|
|
|
waczOpts.signingToken = "bearer " + process.env.WACZ_SIGN_TOKEN;
|
2021-11-23 12:53:30 -08:00
|
|
|
}
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
2022-02-08 15:31:55 -08:00
|
|
|
|
2023-04-04 10:46:03 -04:00
|
|
|
if (this.params.title) {
|
2024-08-29 13:21:20 -07:00
|
|
|
waczOpts.title = this.params.title;
|
2023-04-04 10:46:03 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.params.description) {
|
2024-08-29 13:21:20 -07:00
|
|
|
waczOpts.description = this.params.description;
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
|
|
|
|
|
2024-08-29 13:21:20 -07:00
|
|
|
try {
|
|
|
|
|
const wacz = new WACZ(waczOpts, this.collDir);
|
|
|
|
|
if (!streaming) {
|
|
|
|
|
await wacz.generateToFile(waczPath);
|
|
|
|
|
}
|
2022-02-08 15:31:55 -08:00
|
|
|
|
2024-08-29 13:21:20 -07:00
|
|
|
if (this.storage) {
|
|
|
|
|
await this.crawlState.setStatus("uploading-wacz");
|
2025-01-29 18:15:28 -08:00
|
|
|
|
|
|
|
|
const targetFilename = await this.crawlState.getWACZFilename();
|
2022-02-08 15:31:55 -08:00
|
|
|
|
2024-08-29 13:21:20 -07:00
|
|
|
await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);
|
2025-01-29 18:15:28 -08:00
|
|
|
|
|
|
|
|
await this.crawlState.clearWACZFilename();
|
|
|
|
|
|
2024-08-29 13:21:20 -07:00
|
|
|
return true;
|
|
|
|
|
}
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state is optionally saved) when the total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
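Illustrative sketch (not part of the crawler source): a minimal health check endpoint along the lines described in the commit message above, using Node's built-in http module. The port, worker count, and getErrorCount callback are hypothetical stand-ins for what the real HealthChecker wires up.

import http from "http";

// Return 200 while consecutive failed page loads stay below 2x the number
// of workers, otherwise 503 (sketch of the --healthCheckPort behavior).
export function startHealthCheckSketch(
  port: number,
  numWorkers: number,
  getErrorCount: () => number,
) {
  const server = http.createServer((_req, res) => {
    if (getErrorCount() < numWorkers * 2) {
      res.writeHead(200);
      res.end("OK");
    } else {
      res.writeHead(503);
      res.end("Unhealthy");
    }
  });
  server.listen(port);
  return server;
}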
|
|
|
|
2024-08-29 13:21:20 -07:00
|
|
|
return false;
|
|
|
|
|
} catch (e) {
|
|
|
|
|
logger.error("Error creating WACZ", e);
|
|
|
|
|
if (!streaming) {
|
2025-11-25 07:58:30 -08:00
|
|
|
await logger.interrupt(
|
|
|
|
|
"Unable to write WACZ successfully",
|
|
|
|
|
formatErr(e),
|
|
|
|
|
ExitCodes.GenericError,
|
|
|
|
|
);
|
2025-07-29 15:41:22 -07:00
|
|
|
} else if (this.params.restartsOnError) {
|
2025-11-25 07:58:30 -08:00
|
|
|
await logger.interrupt(
|
|
|
|
|
"Unable to upload WACZ successfully",
|
|
|
|
|
formatErr(e),
|
|
|
|
|
ExitCodes.UploadFailed,
|
|
|
|
|
);
|
2024-08-29 13:21:20 -07:00
|
|
|
}
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
2022-05-18 22:51:55 -07:00
|
|
|
}
|
|
|
|
|
|
2023-04-26 15:41:35 -07:00
|
|
|
logMemory() {
|
|
|
|
|
const memUsage = process.memoryUsage();
|
|
|
|
|
const { heapUsed, heapTotal } = memUsage;
|
|
|
|
|
this.maxHeapUsed = Math.max(this.maxHeapUsed || 0, heapUsed);
|
|
|
|
|
this.maxHeapTotal = Math.max(this.maxHeapTotal || 0, heapTotal);
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
"Memory",
|
|
|
|
|
{
|
|
|
|
|
maxHeapUsed: this.maxHeapUsed,
|
|
|
|
|
maxHeapTotal: this.maxHeapTotal,
|
|
|
|
|
...memUsage,
|
|
|
|
|
},
|
2023-11-14 21:54:40 -08:00
|
|
|
"memoryStatus",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-04-26 15:41:35 -07:00
|
|
|
}
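For reference, a standalone sketch of the same max-heap tracking pattern used by logMemory() above, built on Node's process.memoryUsage() (all values are in bytes):

// Track the maximum heap usage seen across repeated samples (sketch).
let maxHeapUsed = 0;
let maxHeapTotal = 0;

function sampleMemory() {
  const { heapUsed, heapTotal, rss } = process.memoryUsage();
  maxHeapUsed = Math.max(maxHeapUsed, heapUsed);
  maxHeapTotal = Math.max(maxHeapTotal, heapTotal);
  return { maxHeapUsed, maxHeapTotal, heapUsed, heapTotal, rss };
}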
|
|
|
|
|
|
2023-09-15 00:16:19 +02:00
|
|
|
async writeStats() {
|
2022-12-15 12:38:41 -05:00
|
|
|
if (!this.params.logging.includes("stats")) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
2020-12-02 16:26:20 +00:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer needed without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers, each of which runs in its own loop to process pages until there are no pending or queued pages left
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: additional logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
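A rough sketch of the sorted-set queueing idea from the commit message above (depth + extraHops as the score), assuming ioredis; the key name and exact scoring shown here are assumptions for illustration, not the scheme RedisCrawlState actually uses.

import Redis from "ioredis";

// Sketch: queue URLs in a Redis sorted set so lower-score (shallower) pages
// are popped first; the score is assumed to be depth + extraHops.
async function queueUrlSketch(redis: Redis, url: string, depth: number, extraHops: number) {
  await redis.zadd("crawl:queue", depth + extraHops, url);
}

async function nextUrlSketch(redis: Redis): Promise<string | null> {
  const res = await redis.zpopmin("crawl:queue"); // [member, score] or []
  return res.length ? res[0] : null;
}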
|
|
|
const realSize = await this.crawlState.queueSize();
|
2024-05-21 10:58:35 -07:00
|
|
|
const pendingPages = await this.crawlState.getPendingList();
|
2025-01-25 22:55:49 -08:00
|
|
|
const pending = pendingPages.length;
|
|
|
|
|
const crawled = await this.crawlState.numDone();
|
2025-02-06 18:48:40 -08:00
|
|
|
const failed = await this.crawlState.numFailed();
|
2025-02-09 15:26:36 -08:00
|
|
|
const total = realSize + pendingPages.length + crawled + failed;
|
2023-11-09 11:27:11 -08:00
|
|
|
const limit = { max: this.pageLimit || 0, hit: this.limitHit };
|
2022-12-15 12:38:41 -05:00
|
|
|
const stats = {
|
2025-01-25 22:55:49 -08:00
|
|
|
crawled,
|
|
|
|
|
total,
|
|
|
|
|
pending,
|
|
|
|
|
failed,
|
|
|
|
|
limit,
|
2024-05-21 10:58:35 -07:00
|
|
|
pendingPages,
|
2022-12-15 12:38:41 -05:00
|
|
|
};
|
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Crawl statistics", stats, "crawlStatus");
|
2023-04-26 15:41:35 -07:00
|
|
|
this.logMemory();
|
2022-12-15 12:38:41 -05:00
|
|
|
|
2023-09-15 00:16:19 +02:00
|
|
|
if (this.params.statsFilename) {
|
2020-12-02 16:26:20 +00:00
|
|
|
try {
|
2023-11-09 11:27:11 -08:00
|
|
|
await fsp.writeFile(
|
|
|
|
|
this.params.statsFilename,
|
2023-11-09 19:11:11 -05:00
|
|
|
JSON.stringify(stats, null, 2),
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (err) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn("Stats output failed", err);
|
2020-12-02 16:26:20 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
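Illustrative example of the stats object that writeStats() above logs and, when --statsFilename is set, writes to disk as JSON (the numbers are made up):

// Example --statsFilename contents (hypothetical values).
const exampleStats = {
  crawled: 42, // pages done
  total: 100, // queue + pending + crawled + failed
  pending: 4, // pages currently in progress
  failed: 1, // pages that failed
  limit: { max: 0, hit: false }, // page limit (0 = unlimited) and whether it was hit
  pendingPages: [], // serialized info for in-progress pages
};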
|
|
|
|
|
|
2025-02-06 18:48:40 -08:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
pageFailed(msg: string, retry: number, msgData: any) {
|
|
|
|
|
if (retry < this.params.maxPageRetries) {
|
|
|
|
|
logger.warn(
|
|
|
|
|
msg + ": will retry",
|
|
|
|
|
{ retry, retries: this.params.maxPageRetries, ...msgData },
|
|
|
|
|
"pageStatus",
|
|
|
|
|
);
|
|
|
|
|
} else {
|
|
|
|
|
logger.error(
|
|
|
|
|
msg + ": retry limit reached",
|
|
|
|
|
{ retry, retries: this.params.maxPageRetries, ...msgData },
|
|
|
|
|
"pageStatus",
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
throw new Error("logged");
|
|
|
|
|
}
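The thrown Error("logged") above is a sentinel: callers that catch it check e.message !== "logged" so the failure is not logged twice. A minimal sketch of the pattern (names are illustrative):

// Sentinel-error pattern: mark an error as already logged so outer catch
// blocks can rethrow or fail the page without logging it again.
function failAfterLogging(msg: string): never {
  console.error(msg);
  throw new Error("logged");
}

try {
  failAfterLogging("Page Load Failed");
} catch (e) {
  if (e instanceof Error && e.message !== "logged") {
    console.error("Unexpected error", e); // only log errors not yet logged
  }
}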
|
|
|
|
|
|
2025-01-28 11:28:23 -08:00
|
|
|
async loadPage(page: Page, data: PageState, seed: ScopedSeed) {
|
2025-02-06 18:48:40 -08:00
|
|
|
const { url, depth, retry } = data;
|
2023-02-23 18:50:22 -08:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set loadState to 2 'full-page-load'.
- if page.goto() times out and domcontentloaded was not reached either, fail immediately; if domcontentloaded was reached, extract links but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-page-load', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if a frame is actually not from a frame tag (eg. an OBJECT tag), and skip it as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
const logDetails = data.logDetails;
|
2022-03-22 17:41:51 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
// Attempt to load the page:
|
|
|
|
|
// - Already tried direct fetch w/o browser before getting here, and that resulted in an HTML page or non-200 response
|
|
|
|
|
// so now loading using the browser
|
|
|
|
|
// - If page.load() fails, but downloadResponse is set, then it's a download, consider successful
|
|
|
|
|
// set page status to FULL_PAGE_LOADED (2)
|
|
|
|
|
// - If page.load() fails, but firstResponse is set to CONTENT_LOADED (1) state,
|
|
|
|
|
// consider a slow page, proceed to link extraction, but skip behaviors, issue warning
|
|
|
|
|
// - If page.load() fails otherwise and if failOnFailedSeed is set, fail crawl, otherwise fail page
|
|
|
|
|
// - If page.load() succeeds, check if page url is a chrome-error:// page; if so, fail the page (and/or the crawl if failOnFailedSeed is set and this is a seed)
|
|
|
|
|
// - If at least one response, check if HTML, proceed with post-crawl actions only if HTML.
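A condensed sketch of the decision table in the comments above, expressed as a standalone function; the real logic below is interleaved with page.goto(), retries, and logging.

// Possible load outcomes, condensed from the comments above (sketch only).
type LoadOutcome =
  | "full-page-loaded"
  | "download"
  | "slow-page-skip-behaviors"
  | "failed";

function classifyLoad(
  fullyLoaded: boolean,
  isDownload: boolean,
  contentLoaded: boolean,
): LoadOutcome {
  if (fullyLoaded) return "full-page-loaded";
  if (isDownload) return "download"; // also treated as fully loaded
  if (contentLoaded) return "slow-page-skip-behaviors"; // extract links, skip behaviors
  return "failed"; // may fail the whole crawl if failOnFailedSeed is set
}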
|
|
|
|
|
|
|
|
|
|
let downloadResponse: HTTPResponse | null = null;
|
|
|
|
|
let firstResponse: HTTPResponse | null = null;
|
|
|
|
|
let fullLoadedResponse: HTTPResponse | null = null;
|
|
|
|
|
|
|
|
|
|
// Detect if failure is actually caused by trying to load a non-page (eg. downloadable PDF),
|
|
|
|
|
// store the downloadResponse, if any
|
2023-11-09 11:27:11 -08:00
|
|
|
page.once("requestfailed", (req: HTTPRequest) => {
|
2024-06-26 09:16:24 -07:00
|
|
|
downloadResponse = getDownloadResponse(req);
|
2022-03-14 11:11:53 -07:00
|
|
|
});
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
// store the first successful non-redirect response, even if page doesn't load fully
|
|
|
|
|
const waitFirstResponse = (resp: HTTPResponse) => {
|
2025-01-28 11:28:23 -08:00
|
|
|
if (!isRedirectStatus(resp.status())) {
|
|
|
|
|
firstResponse = resp;
|
2024-06-26 09:16:24 -07:00
|
|
|
// don't listen to any additional responses
|
|
|
|
|
page.off("response", waitFirstResponse);
|
|
|
|
|
}
|
|
|
|
|
};
|
2023-11-07 21:38:50 -08:00
|
|
|
|
2025-01-25 22:55:49 -08:00
|
|
|
const handleFirstLoadEvents = () => {
|
|
|
|
|
page.on("response", waitFirstResponse);
|
2024-06-26 09:16:24 -07:00
|
|
|
|
2025-01-25 22:55:49 -08:00
|
|
|
// store that domcontentloaded was finished
|
|
|
|
|
page.once("domcontentloaded", () => {
|
|
|
|
|
data.loadState = LoadState.CONTENT_LOADED;
|
|
|
|
|
});
|
|
|
|
|
};
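isRedirectStatus() is imported from elsewhere in the crawler; a plausible sketch of such a helper, assuming the standard HTTP redirect codes (the crawler's own implementation may differ):

// Sketch of a redirect check like the isRedirectStatus() used above.
function isRedirectStatusSketch(status: number): boolean {
  return [301, 302, 303, 307, 308].includes(status);
}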
|
2023-03-20 18:31:37 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
const gotoOpts = data.isHTMLPage
|
2023-11-09 11:27:11 -08:00
|
|
|
? this.gotoOpts
|
|
|
|
|
: { waitUntil: "domcontentloaded" };
|
2022-03-22 17:41:51 -07:00
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Awaiting page load", logDetails);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2025-01-25 22:55:49 -08:00
|
|
|
const urlNoHash = url.split("#")[0];
|
|
|
|
|
|
|
|
|
|
const fullRefresh = urlNoHash === page.url().split("#")[0];
|
|
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
try {
|
2025-01-25 22:55:49 -08:00
|
|
|
if (!fullRefresh) {
|
|
|
|
|
handleFirstLoadEvents();
|
|
|
|
|
}
|
2024-06-26 09:16:24 -07:00
|
|
|
// store the page load response when page fully loads
|
|
|
|
|
fullLoadedResponse = await page.goto(url, gotoOpts);
|
2025-01-25 22:55:49 -08:00
|
|
|
|
|
|
|
|
if (fullRefresh) {
|
|
|
|
|
logger.debug("Hashtag-only change, doing full page reload");
|
|
|
|
|
|
|
|
|
|
handleFirstLoadEvents();
|
|
|
|
|
|
|
|
|
|
fullLoadedResponse = await page.reload(gotoOpts);
|
|
|
|
|
}
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
|
|
|
|
if (!(e instanceof Error)) {
|
|
|
|
|
throw e;
|
|
|
|
|
}
|
2023-09-18 15:24:33 -07:00
|
|
|
const msg = e.message || "";
|
2024-06-26 09:16:24 -07:00
|
|
|
|
|
|
|
|
// got firstResponse and content loaded, not a failure
|
|
|
|
|
if (firstResponse && data.loadState == LoadState.CONTENT_LOADED) {
|
2023-09-18 15:24:33 -07:00
|
|
|
// if timeout error, and at least got to content loaded, continue on
|
2024-06-26 09:16:24 -07:00
|
|
|
logger.warn(
|
|
|
|
|
"Page load timed out, loading but slowly, skipping behaviors",
|
|
|
|
|
{
|
2023-11-09 11:27:11 -08:00
|
|
|
msg,
|
|
|
|
|
...logDetails,
|
2024-06-26 09:16:24 -07:00
|
|
|
},
|
|
|
|
|
);
|
|
|
|
|
data.skipBehaviors = true;
|
|
|
|
|
} else if (!downloadResponse) {
|
2024-09-05 13:28:49 -07:00
|
|
|
// log if not already logged and rethrow; consider page failed
|
|
|
|
|
if (msg !== "logged") {
|
2025-01-28 11:28:23 -08:00
|
|
|
const loadState = data.loadState;
|
|
|
|
|
|
|
|
|
|
// excluded in recorder
|
|
|
|
|
if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) {
|
|
|
|
|
data.pageSkipped = true;
|
|
|
|
|
logger.warn("Page Load Blocked, skipping", { msg, loadState });
|
|
|
|
|
} else {
|
2025-02-06 18:48:40 -08:00
|
|
|
return this.pageFailed("Page Load Failed", retry, {
|
2025-01-28 11:28:23 -08:00
|
|
|
msg,
|
2025-02-06 18:48:40 -08:00
|
|
|
url,
|
2025-01-28 11:28:23 -08:00
|
|
|
loadState,
|
|
|
|
|
...logDetails,
|
|
|
|
|
});
|
|
|
|
|
}
|
2023-03-20 18:31:37 -07:00
|
|
|
}
|
2022-03-14 11:11:53 -07:00
|
|
|
}
|
2021-05-21 15:37:02 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
const resp = fullLoadedResponse || downloadResponse || firstResponse;
|
2023-03-20 18:31:37 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
if (!resp) {
|
2025-02-06 18:48:40 -08:00
|
|
|
return this.pageFailed("Page Load Failed, no response", retry, {
|
|
|
|
|
url,
|
|
|
|
|
...logDetails,
|
|
|
|
|
});
|
2024-06-26 09:16:24 -07:00
|
|
|
}
|
2023-03-17 12:50:32 -07:00
|
|
|
|
2025-01-16 15:51:35 -08:00
|
|
|
const respUrl = resp.url().split("#")[0];
|
2024-06-26 09:16:24 -07:00
|
|
|
const isChromeError = page.url().startsWith("chrome-error://");
|
2023-03-20 18:31:37 -07:00
|
|
|
|
2025-01-16 15:51:35 -08:00
|
|
|
if (
|
|
|
|
|
depth === 0 &&
|
|
|
|
|
!isChromeError &&
|
2025-01-25 22:55:49 -08:00
|
|
|
respUrl !== urlNoHash &&
|
|
|
|
|
respUrl + "/" !== url &&
|
2025-01-16 15:51:35 -08:00
|
|
|
!downloadResponse
|
|
|
|
|
) {
|
2024-06-26 09:16:24 -07:00
|
|
|
data.seedId = await this.crawlState.addExtraSeed(
|
|
|
|
|
this.seeds,
|
|
|
|
|
this.numOriginalSeeds,
|
|
|
|
|
data.seedId,
|
|
|
|
|
respUrl,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2024-06-26 09:16:24 -07:00
|
|
|
logger.info("Seed page redirected, adding redirected seed", {
|
|
|
|
|
origUrl: url,
|
|
|
|
|
newUrl: respUrl,
|
|
|
|
|
seedId: data.seedId,
|
|
|
|
|
});
|
|
|
|
|
}
|
2023-11-09 11:27:11 -08:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
const status = resp.status();
|
|
|
|
|
data.status = status;
|
|
|
|
|
|
|
|
|
|
let failed = isChromeError;
|
|
|
|
|
|
|
|
|
|
if (this.params.failOnInvalidStatus && status >= 400) {
|
|
|
|
|
// Handle 4xx or 5xx response as a page load error
|
|
|
|
|
failed = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (failed) {
|
2025-02-06 18:48:40 -08:00
|
|
|
return this.pageFailed(
|
2024-09-05 13:28:49 -07:00
|
|
|
isChromeError ? "Page Crashed on Load" : "Page Invalid Status",
|
2025-02-06 18:48:40 -08:00
|
|
|
retry,
|
|
|
|
|
{ url, status, ...logDetails },
|
2024-09-05 13:28:49 -07:00
|
|
|
);
|
2024-06-26 09:16:24 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const contentType = resp.headers()["content-type"];
|
2023-03-20 18:31:37 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
if (contentType) {
|
|
|
|
|
data.mime = contentType.split(";")[0];
|
|
|
|
|
data.isHTMLPage = isHTMLMime(data.mime);
|
2023-03-08 21:31:19 -05:00
|
|
|
} else {
|
2024-06-26 09:16:24 -07:00
|
|
|
// guess that it's HTML if it fully loaded as a page
|
|
|
|
|
data.isHTMLPage = !!fullLoadedResponse;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Full Page Loaded if:
|
|
|
|
|
// - it was a download response
|
|
|
|
|
// - page.load() succeeded
|
|
|
|
|
// but not:
|
|
|
|
|
// - if first response was received, but not fully loaded
|
|
|
|
|
if (fullLoadedResponse || downloadResponse) {
|
|
|
|
|
data.loadState = LoadState.FULL_PAGE_LOADED;
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
2022-03-22 17:41:51 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
if (!data.isHTMLPage) {
|
|
|
|
|
data.filteredFrames = [];
|
|
|
|
|
|
|
|
|
|
logger.info(
|
|
|
|
|
"Non-HTML Page URL, skipping all post-crawl actions",
|
|
|
|
|
{ isDownload: !!downloadResponse, mime: data.mime, ...logDetails },
|
|
|
|
|
"pageStatus",
|
|
|
|
|
);
|
2022-03-22 17:41:51 -07:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
// HTML Pages Only here
|
2024-09-06 16:24:18 -07:00
|
|
|
const frames = page.frames();
|
2024-06-26 09:16:24 -07:00
|
|
|
|
|
|
|
|
const filteredFrames = await Promise.allSettled(
|
|
|
|
|
frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)),
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
data.filteredFrames = filteredFrames
|
|
|
|
|
.filter((x: PromiseSettledResult<Frame | null>) => {
|
|
|
|
|
if (x.status === "fulfilled") {
|
|
|
|
|
return !!x.value;
|
|
|
|
|
}
|
|
|
|
|
logger.warn("Error in iframe check", {
|
|
|
|
|
reason: x.reason,
|
|
|
|
|
...logDetails,
|
|
|
|
|
});
|
|
|
|
|
return false;
|
|
|
|
|
})
|
|
|
|
|
.map((x) => (x as PromiseFulfilledResult<Frame>).value);
|
|
|
|
|
|
|
|
|
|
//data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails));
|
|
|
|
|
|
2024-09-30 15:46:34 -07:00
|
|
|
const { seedId, extraHops } = data;
|
2024-02-28 11:31:59 -08:00
|
|
|
|
2024-06-13 17:18:06 -07:00
|
|
|
if (!seed) {
|
|
|
|
|
logger.error(
|
|
|
|
|
"Seed not found, likely invalid crawl state - skipping link extraction and behaviors",
|
|
|
|
|
{ seedId, ...logDetails },
|
|
|
|
|
);
|
|
|
|
|
return;
|
|
|
|
|
}
|
2021-07-20 15:45:51 -07:00
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
await this.checkCF(page, logDetails);
|
2022-03-18 10:32:59 -07:00
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
await this.netIdle(page, logDetails);
|
2022-09-20 17:09:52 -07:00
|
|
|
|
2025-11-19 15:49:49 -08:00
|
|
|
// allow failing crawl via script only within awaitPageLoad()
|
2025-07-08 13:08:52 -07:00
|
|
|
data.contentCheckAllowed = true;
|
|
|
|
|
|
2024-04-18 17:16:57 -07:00
|
|
|
await this.awaitPageLoad(page.mainFrame(), logDetails);
|
2024-03-28 17:17:29 -07:00
|
|
|
|
2025-07-08 13:08:52 -07:00
|
|
|
data.contentCheckAllowed = false;
|
|
|
|
|
|
2021-07-20 15:45:51 -07:00
|
|
|
// skip extraction if at max depth
|
2024-11-08 08:04:41 -08:00
|
|
|
if (seed.isAtMaxDepth(depth, extraHops)) {
|
|
|
|
|
logger.debug("Skipping Link Extraction, At Max Depth", {}, "links");
|
2021-07-20 15:45:51 -07:00
|
|
|
return;
|
|
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2024-11-08 08:04:41 -08:00
|
|
|
logger.debug(
|
|
|
|
|
"Extracting links",
|
|
|
|
|
{ selectors: this.params.selectLinks, ...logDetails },
|
|
|
|
|
"links",
|
|
|
|
|
);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2024-11-08 08:04:41 -08:00
|
|
|
await this.extractLinks(page, data, this.params.selectLinks, logDetails);
|
2021-07-23 18:31:43 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async netIdle(page: Page, details: LogDetails) {
|
2022-07-08 17:17:46 -07:00
|
|
|
if (!this.params.netIdleWait) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
// in case page starts loading via fetch/xhr immediately after page load,
|
|
|
|
|
// we want to ensure we don't exit too early
|
2023-03-13 14:48:04 -07:00
|
|
|
await sleep(0.5);
|
2022-07-08 17:17:46 -07:00
|
|
|
|
|
|
|
|
try {
|
2023-11-09 11:27:11 -08:00
|
|
|
await this.browser.waitForNetworkIdle(page, {
|
|
|
|
|
timeout: this.params.netIdleWait * 1000,
|
2025-11-18 16:34:02 -08:00
|
|
|
concurrency: this.params.netIdleMaxRequests,
|
2023-11-09 11:27:11 -08:00
|
|
|
});
|
2022-07-08 17:17:46 -07:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("waitForNetworkIdle timed out, ignoring", details);
|
2022-07-08 17:17:46 -07:00
|
|
|
// ignore, continue
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-04-18 17:16:57 -07:00
|
|
|
async awaitPageLoad(frame: Frame, logDetails: LogDetails) {
|
2025-03-31 12:02:25 -07:00
|
|
|
if (this.params.behaviorOpts) {
|
|
|
|
|
try {
|
|
|
|
|
await timedRun(
|
|
|
|
|
frame.evaluate(
|
|
|
|
|
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
|
|
|
|
|
),
|
2025-07-08 13:08:52 -07:00
|
|
|
PAGE_OP_TIMEOUT_SECS * 4,
|
2025-03-31 12:02:25 -07:00
|
|
|
"Custom page load check timed out",
|
|
|
|
|
logDetails,
|
|
|
|
|
);
|
|
|
|
|
} catch (e) {
|
|
|
|
|
logger.warn("Waiting for custom page load failed", e, "behavior");
|
|
|
|
|
}
|
2024-04-18 17:16:57 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.params.postLoadDelay) {
|
|
|
|
|
logger.info("Awaiting post load delay", {
|
|
|
|
|
seconds: this.params.postLoadDelay,
|
|
|
|
|
});
|
|
|
|
|
await sleep(this.params.postLoadDelay);
|
|
|
|
|
}
|
|
|
|
|
}
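timedRun() is used throughout this file to bound async operations; a minimal sketch of such a helper, assuming it logs the message and resolves to undefined when the timeout wins (the crawler's real timedRun may behave differently):

// Race a promise against a timeout, logging instead of rejecting on timeout.
async function timedRunSketch<T>(
  promise: Promise<T>,
  seconds: number,
  message: string,
): Promise<T | undefined> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<undefined>((resolve) => {
    timer = setTimeout(() => {
      console.warn(message);
      resolve(undefined);
    }, seconds * 1000);
  });
  try {
    return await Promise.race([promise, timeout]);
  } finally {
    if (timer) clearTimeout(timer);
  }
}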
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async extractLinks(
|
|
|
|
|
page: Page,
|
|
|
|
|
data: PageState,
|
2024-11-08 08:04:41 -08:00
|
|
|
selectors: ExtractSelector[],
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails: LogDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
|
|
|
|
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
|
2021-07-23 18:31:43 -07:00
|
|
|
|
2024-02-28 22:56:32 -08:00
|
|
|
callbacks.addLink = async (url: string) => {
|
2024-07-11 19:48:43 -07:00
|
|
|
await this.queueInScopeUrls(
|
|
|
|
|
seedId,
|
|
|
|
|
[url],
|
|
|
|
|
depth,
|
|
|
|
|
extraHops,
|
|
|
|
|
false,
|
|
|
|
|
logDetails,
|
|
|
|
|
);
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log whether profile and text-extraction are enabled, and log post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
};
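Illustrative example of the selector configs iterated below; the shape matches the { selector, extract, attrOnly } destructuring, and the first entry reflects the default a[href]->href link rule (the second entry is hypothetical):

// Example ExtractSelector-style entries (illustrative only).
const exampleSelectors = [
  // follow standard links via the href property
  { selector: "a[href]", extract: "href", attrOnly: false },
  // hypothetical: read a raw attribute value via getAttribute instead
  { selector: "[data-href]", extract: "data-href", attrOnly: true },
];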
|
|
|
|
|
|
2023-09-15 10:12:08 -07:00
|
|
|
const frames = filteredFrames || page.frames();
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-03-17 12:50:32 -07:00
|
|
|
try {
|
2025-03-31 12:02:25 -07:00
|
|
|
for (const { selector, extract, attrOnly } of selectors) {
|
2024-10-11 00:11:24 -07:00
|
|
|
await Promise.allSettled(
|
|
|
|
|
frames.map((frame) => {
|
|
|
|
|
const getLinks = frame
|
2025-03-31 12:02:25 -07:00
|
|
|
.evaluate(
|
|
|
|
|
`self.__bx_behaviors.extractLinks(${JSON.stringify(
|
|
|
|
|
selector,
|
|
|
|
|
)}, ${JSON.stringify(extract)}, ${attrOnly})`,
|
|
|
|
|
)
|
2024-10-11 00:11:24 -07:00
|
|
|
.catch((e) =>
|
|
|
|
|
logger.warn("Link Extraction failed in frame", {
|
|
|
|
|
frameUrl: frame.url,
|
|
|
|
|
...logDetails,
|
|
|
|
|
...formatErr(e),
|
|
|
|
|
}),
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
return timedRun(
|
|
|
|
|
getLinks,
|
2023-11-09 11:27:11 -08:00
|
|
|
PAGE_OP_TIMEOUT_SECS,
|
|
|
|
|
"Link extraction timed out",
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails,
|
2024-10-11 00:11:24 -07:00
|
|
|
);
|
|
|
|
|
}),
|
2023-09-15 10:12:08 -07:00
|
|
|
);
|
2021-07-20 15:45:51 -07:00
|
|
|
}
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
|
|
|
|
logger.warn("Link Extraction failed", e, "links");
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
2020-11-14 21:55:02 +00:00
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async queueInScopeUrls(
|
|
|
|
|
seedId: number,
|
|
|
|
|
urls: string[],
|
|
|
|
|
depth: number,
|
|
|
|
|
extraHops = 0,
|
2024-07-11 19:48:43 -07:00
|
|
|
noOOS = false,
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails: LogDetails = {},
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
2020-10-31 13:16:37 -07:00
|
|
|
try {
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
depth += 1;
|
|
|
|
|
|
2022-01-15 09:03:09 -08:00
|
|
|
// new number of extra hops, set if this hop is out-of-scope (oos)
|
|
|
|
|
const newExtraHops = extraHops + 1;
|
2021-06-26 13:11:29 -07:00
|
|
|
|
2022-01-15 09:03:09 -08:00
|
|
|
for (const possibleUrl of urls) {
|
2024-06-18 16:11:48 -07:00
|
|
|
const res = this.getScope(
|
2024-07-11 19:48:43 -07:00
|
|
|
{ url: possibleUrl, extraHops: newExtraHops, depth, seedId, noOOS },
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2022-01-15 09:03:09 -08:00
|
|
|
|
|
|
|
|
if (!res) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const { url, isOOS } = res;
|
2022-01-15 09:03:09 -08:00
|
|
|
|
|
|
|
|
if (url) {
|
2023-11-09 11:27:11 -08:00
|
|
|
await this.queueUrl(
|
|
|
|
|
seedId,
|
|
|
|
|
url,
|
|
|
|
|
depth,
|
|
|
|
|
isOOS ? newExtraHops : extraHops,
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
}
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
|
|
|
|
logger.error("Queuing Error", e, "links");
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
}
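A small sketch of the hop bookkeeping in queueInScopeUrls() above: extraHops only advances for links judged out-of-scope (OOS); in-scope links inherit the parent's count.

// Sketch: how extraHops is chosen for each queued link (see loop above).
function nextExtraHops(parentExtraHops: number, isOOS: boolean): number {
  return isOOS ? parentExtraHops + 1 : parentExtraHops;
}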
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async checkCF(page: Page, logDetails: LogDetails) {
|
2022-03-18 10:32:59 -07:00
|
|
|
try {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Check CF Blocking", logDetails);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
while (
|
|
|
|
|
await timedRun(
|
|
|
|
|
page.$("div.cf-browser-verification.cf-im-under-attack"),
|
|
|
|
|
PAGE_OP_TIMEOUT_SECS,
|
|
|
|
|
"Cloudflare check timed out",
|
|
|
|
|
logDetails,
|
|
|
|
|
"general",
|
2023-11-09 19:11:11 -05:00
|
|
|
true,
|
2023-11-09 11:27:11 -08:00
|
|
|
)
|
|
|
|
|
) {
|
|
|
|
|
logger.debug(
|
|
|
|
|
"Cloudflare Check Detected, waiting for reload...",
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-03-13 14:48:04 -07:00
|
|
|
await sleep(5.5);
|
2022-03-18 10:32:59 -07:00
|
|
|
}
|
|
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
//logger.warn("Check CF failed, ignoring");
|
2022-03-18 10:32:59 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async queueUrl(
|
|
|
|
|
seedId: number,
|
|
|
|
|
url: string,
|
|
|
|
|
depth: number,
|
|
|
|
|
extraHops: number,
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails: LogDetails = {},
|
2024-03-22 17:32:42 -07:00
|
|
|
ts = 0,
|
|
|
|
|
pageid?: string,
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (this.limitHit) {
|
2021-06-26 13:11:29 -07:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const result = await this.crawlState.addToQueue(
|
2024-03-22 17:32:42 -07:00
|
|
|
{ url, seedId, depth, extraHops, ts, pageid },
|
2023-11-09 19:11:11 -05:00
|
|
|
this.pageLimit,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-09-15 10:12:08 -07:00
|
|
|
|
|
|
|
|
switch (result) {
|
2023-11-09 19:11:11 -05:00
|
|
|
case QueueState.ADDED:
|
2025-09-12 13:34:41 -07:00
|
|
|
logger.debug("Queued new page URL", { url, ...logDetails }, "links");
|
2023-11-09 19:11:11 -05:00
|
|
|
return true;
|
2023-09-15 10:12:08 -07:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
case QueueState.LIMIT_HIT:
|
2025-09-12 13:34:41 -07:00
|
|
|
logger.debug(
|
2025-07-24 23:52:12 -07:00
|
|
|
"Page URL not queued, at page limit",
|
2023-11-09 19:11:11 -05:00
|
|
|
{ url, ...logDetails },
|
2025-09-12 13:34:41 -07:00
|
|
|
"links",
|
2023-11-09 19:11:11 -05:00
|
|
|
);
|
2025-09-12 13:34:41 -07:00
|
|
|
if (!this.limitHit && depth === 0) {
|
|
|
|
|
logger.error(
|
|
|
|
|
"Page limit reached when adding URL list, some URLs not crawled.",
|
|
|
|
|
);
|
|
|
|
|
}
|
2023-11-09 19:11:11 -05:00
|
|
|
this.limitHit = true;
|
|
|
|
|
return false;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
case QueueState.DUPE_URL:
|
2025-09-12 13:34:41 -07:00
|
|
|
logger.debug(
|
2025-07-24 23:52:12 -07:00
|
|
|
"Page URL not queued, already seen",
|
2023-11-09 19:11:11 -05:00
|
|
|
{ url, ...logDetails },
|
2025-09-12 13:34:41 -07:00
|
|
|
"links",
|
2023-11-09 19:11:11 -05:00
|
|
|
);
|
|
|
|
|
return false;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
}
|
|
|
|
|
|
2023-09-15 10:12:08 -07:00
|
|
|
return false;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
|
2024-04-11 13:55:52 -07:00
|
|
|
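// Aside: queueUrl() delegates deduplication and page-limit enforcement to
// crawlState.addToQueue(), which reports the outcome via QueueState. A standalone,
// in-memory sketch of that contract (SketchQueue and SketchQueueState are
// hypothetical illustrations, not the Redis-backed implementation in util/state.js):
enum SketchQueueState {
  ADDED,
  LIMIT_HIT,
  DUPE_URL,
}

class SketchQueue {
  private seen = new Set<string>();
  private queue: { url: string; depth: number; extraHops: number }[] = [];

  add(
    data: { url: string; depth: number; extraHops: number },
    pageLimit: number,
  ): SketchQueueState {
    if (this.seen.has(data.url)) {
      return SketchQueueState.DUPE_URL;
    }
    if (pageLimit > 0 && this.seen.size >= pageLimit) {
      return SketchQueueState.LIMIT_HIT;
    }
    this.seen.add(data.url);
    this.queue.push(data);
    return SketchQueueState.ADDED;
  }
}
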
  async initPages(filename: string, title: string) {
    let fh = null;

    try {
      await fsp.mkdir(this.pagesDir, { recursive: true });

      const createNew = !fs.existsSync(filename);

      fh = fs.createWriteStream(filename, { flags: "a" });

      if (createNew) {
        const header: Record<string, string> = {
          format: "json-pages-1.0",
          id: "pages",
          title,
        };
        header.hasText = this.params.text.includes("to-pages") + "";
        if (this.params.text.length) {
          logger.debug("Text Extraction: " + this.params.text.join(","));
        } else {
          logger.debug("Text Extraction: None");
        }
        fh.write(JSON.stringify(header) + "\n");
      }
    } catch (err) {
      logger.error(`"${filename}" creation failed`, err);
    }
    return fh;
  }

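// Aside: initPages() produces a JSON-Lines file whose first line is the header
// record written above ({"format": "json-pages-1.0", "id": "pages", ...}), with one
// page record per subsequent line appended by writePage(). A sketch of a reader for
// that layout (readPagesJsonl is hypothetical, not part of this codebase):
import { createReadStream } from "fs";
import { createInterface } from "readline";

async function readPagesJsonl(filename: string) {
  const rl = createInterface({ input: createReadStream(filename) });
  let header: Record<string, unknown> | null = null;
  const pages: Record<string, unknown>[] = [];

  for await (const line of rl) {
    if (!line.trim()) {
      continue;
    }
    const record = JSON.parse(line);
    if (!header) {
      header = record; // first non-empty line is the format header
    } else {
      pages.push(record);
    }
  }

  return { header, pages };
}
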
  protected pageEntryForRedis(
    entry: Record<string, string | number | boolean | object>,
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    state: PageState,
  ) {
    return entry;
  }

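// Aside: pageEntryForRedis() is a subclass hook: the base implementation passes the
// entry through unchanged so a derived crawler can enrich what gets pushed to the
// Redis pages queue. A standalone sketch of that pattern (these class names are
// hypothetical):
type SketchPageRow = Record<string, string | number | boolean | object>;

class SketchPageWriter {
  protected pageEntryForQueue(entry: SketchPageRow): SketchPageRow {
    return entry; // default: pass through unchanged
  }
}

class EnrichedPageWriter extends SketchPageWriter {
  protected override pageEntryForQueue(entry: SketchPageRow): SketchPageRow {
    return { ...entry, source: "enriched" }; // derived class adds extra fields
  }
}
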
  async writePage(state: PageState) {
    const {
      pageid,
      url,
      depth,
      title,
      text,
      loadState,
      mime,
      favicon,
      status,
    } = state;

    const row: PageEntry = { id: pageid, url, title, loadState };

    let { ts } = state;
    if (!ts) {
      ts = new Date();
      if (!this.params.dryRun) {
        logger.warn(
          "Page date missing, setting to now",
          { url, ts },
          "pageStatus",
        );
      }
    }

    row.ts = ts.toISOString();

    if (mime) {
      row.mime = mime;
    }

    if (status) {
      row.status = status;
    }

    if (depth === 0) {
      row.seed = true;
    }

    if (Number.isInteger(depth)) {
      row.depth = depth;
    }

    if (favicon) {
      row.favIconUrl = favicon;
    }

    if (this.params.writePagesToRedis) {
      await this.crawlState.writeToPagesQueue(
        this.pageEntryForRedis(row, state),
      );
    }

    if (text && this.textInPages) {
      row.text = text;
    }

    const processedRow = JSON.stringify(row) + "\n";

    const pagesFH = depth > 0 ? this.extraPagesFH : this.pagesFH;

    if (!pagesFH) {
      logger.error("Can't write pages, missing stream", {}, "pageStatus");
      return;
    }

    try {
      pagesFH.write(processedRow);
    } catch (err) {
      logger.warn(
        "Page append failed",
        { pagesFile: depth > 0 ? this.otherPagesFile : this.seedPagesFile },
        "pageStatus",
      );
    }
  }

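// Aside: the fields assigned to `row` above imply roughly the following shape for
// PageEntry; the actual type is defined elsewhere in the codebase and may include
// more fields than this sketch.
interface PageEntrySketch {
  id: string;
  url: string;
  title?: string;
  loadState: LoadState;
  ts?: string; // ISO-8601; filled from state.ts, or "now" if missing
  mime?: string;
  status?: number;
  seed?: boolean; // set only for depth === 0 (seed) pages
  depth?: number;
  favIconUrl?: string;
  text?: string; // only present when "to-pages" text extraction is enabled
}
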
  async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
    if (!sitemap) {
      return;
    }

    if (await this.crawlState.isSitemapDone()) {
      logger.info("Sitemap already processed, skipping", {}, "sitemap");
      return;
    }

    const fromDate = this.params.sitemapFromDate
      ? new Date(this.params.sitemapFromDate)
      : undefined;
    const toDate = this.params.sitemapToDate
      ? new Date(this.params.sitemapToDate)
      : undefined;
    const headers = this.headers;

    logger.info(
      "Fetching sitemap",
      { from: fromDate || "<any date>", to: toDate || "<any date>" },
      "sitemap",
    );
    const sitemapper = new SitemapReader({
      headers,
      fromDate,
      toDate,
      limit: this.pageLimit,
    });

    try {
      await sitemapper.parse(sitemap, url);
    } catch (e) {
      logger.warn(
        "Sitemap for seed failed",
        { url, sitemap, ...formatErr(e) },
        "sitemap",
      );
      return;
    }

    let power = 1;
    let resolved = false;
    let finished = false;

    await new Promise<void>((resolve) => {
      sitemapper.on("end", () => {
        resolve();
        if (!finished) {
          logger.info(
            "Sitemap Parsing Finished",
            { urlsFound: sitemapper.count, limitHit: sitemapper.atLimit() },
            "sitemap",
          );
          this.crawlState
            .markSitemapDone()
            .catch((e) => logger.warn("Error marking sitemap done", e));
          finished = true;
        }
      });

      sitemapper.on("url", ({ url }) => {
        const count = sitemapper.count;
        if (count % 10 ** power === 0) {
          if (count % 10 ** (power + 1) === 0 && power <= 3) {
            power++;
          }
          const sitemapsQueued = sitemapper.getSitemapsQueued();
          logger.debug(
            "Sitemap URLs processed so far",
            { count, sitemapsQueued },
            "sitemap",
          );
        }
        this.queueInScopeUrls(seedId, [url], 0, 0, true).catch((e) =>
          logger.warn("Error queuing urls", e, "links"),
        );
        if (count >= 100 && !resolved) {
          logger.info(
            "Sitemap partially parsed, continue parsing large sitemap in the background",
            { urlsFound: count },
            "sitemap",
          );
          resolve();
          resolved = true;
        }
      });
    });
  }

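// Aside: parseSitemap() resolves its promise after the first 100 URLs (or on "end"),
// while the event handlers keep queueing the remainder of a large sitemap in the
// background. A standalone sketch of that "return early, keep consuming" pattern
// (the names below are hypothetical, not this codebase's API):
import { EventEmitter } from "events";

function consumeWithEarlyReturn(
  source: EventEmitter,
  earlyCount = 100,
): Promise<void> {
  return new Promise<void>((resolve) => {
    let count = 0;
    let resolved = false;

    source.on("item", () => {
      count++;
      // queue/process the item here...
      if (count >= earlyCount && !resolved) {
        resolved = true;
        resolve(); // caller proceeds; this listener keeps draining the stream
      }
    });

    // resolving an already-resolved promise is a no-op
    source.on("end", () => resolve());
  });
}
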
  async combineWARC() {
    logger.info("Generating Combined WARCs");
    await this.crawlState.setStatus("generate-warc");

    // Get the list of created WARCs
    const warcLists = await fsp.readdir(this.archivesDir);

    logger.debug(`Combining ${warcLists.length} WARCs...`);

    const fileSizeObjects = []; // Used to sort the created WARCs by file size

    // Go through the list of created WARCs and build an array sorted by file size, largest first.
    for (let i = 0; i < warcLists.length; i++) {
      const fileName = path.join(this.archivesDir, warcLists[i]);
      const fileSize = await getFileSize(fileName);
      fileSizeObjects.push({ fileSize: fileSize, fileName: fileName });
      fileSizeObjects.sort((a, b) => b.fileSize - a.fileSize);
    }

    const generatedCombinedWarcs = [];

    // Used to name combined WARCs, default to -1 for first increment
    let combinedWarcNumber = -1;

    // write combined WARC to collection root
    let combinedWarcFullPath = "";

    // fileHandler
    let fh = null;

    // Iterate through the sorted file size array.
    for (let j = 0; j < fileSizeObjects.length; j++) {
      // if need to rollover to new warc
      let doRollover = false;

      // set to true for first warc
      if (combinedWarcNumber < 0) {
        doRollover = true;
      } else {
        // Check the size of the existing combined WARC.
        const currentCombinedWarcSize = await getFileSize(combinedWarcFullPath);

        // If adding the current WARC keeps the combined file under the rollover size, append the data to the combined WARC.
        const proposedWarcSize =
          fileSizeObjects[j].fileSize + currentCombinedWarcSize;

        doRollover = proposedWarcSize >= this.params.rolloverSize;
      }

      if (doRollover) {
        // If adding the current WARC to the existing combined file would exceed the rollover size, do the following:
        // 1. increment the combinedWarcNumber
        // 2. create the name of the new combinedWarcFile
        // 3. Write the header out to the new file
        // 4. Write out the current warc data to the combinedFile
        combinedWarcNumber = combinedWarcNumber + 1;

        const combinedWarcName = `${this.params.collection}_${combinedWarcNumber}.warc.gz`;

        // write combined warcs to root collection dir as they're output of a collection (like wacz)
        combinedWarcFullPath = path.join(this.collDir, combinedWarcName);

        if (fh) {
          fh.end();
        }

        fh = fs.createWriteStream(combinedWarcFullPath, { flags: "a" });

        generatedCombinedWarcs.push(combinedWarcName);

        const warcBuffer = await createWARCInfo(combinedWarcName);
        fh.write(warcBuffer);
      }

      logger.debug(`Appending WARC ${fileSizeObjects[j].fileName}`);

      const reader = fs.createReadStream(fileSizeObjects[j].fileName);

      const p = new Promise<void>((resolve) => {
        reader.on("end", () => resolve());
      });

      if (fh) {
        reader.pipe(fh, { end: false });
      }

      await p;
    }

    if (fh) {
      fh.end();
    }

    logger.debug(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
  }

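// Aside: the rollover decision in combineWARC() reduces to this pure function:
// start a new combined WARC for the very first file, or whenever appending the
// next WARC would push the current combined file to rolloverSize or beyond.
function needsRollover(
  combinedWarcNumber: number,
  currentCombinedSize: number,
  nextWarcSize: number,
  rolloverSize: number,
): boolean {
  if (combinedWarcNumber < 0) {
    return true; // no combined WARC started yet
  }
  return currentCombinedSize + nextWarcSize >= rolloverSize;
}
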
async serializeConfig(done = false) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
switch (this.params.saveState) {
|
2023-11-09 19:11:11 -05:00
|
|
|
case "never":
|
2022-03-14 10:41:56 -07:00
|
|
|
return;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
case "partial":
|
|
|
|
|
if (!done) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
if (await this.crawlState.isFinished()) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case "always":
|
|
|
|
|
default:
|
|
|
|
|
break;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
}
|
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
const now = new Date();
|
|
|
|
|
|
|
|
|
|
if (!done) {
|
|
|
|
|
// if not done, save state only after specified interval has elapsed
|
2023-11-09 11:27:11 -08:00
|
|
|
if (
|
|
|
|
|
secondsElapsed(this.lastSaveTime, now) < this.params.saveStateInterval
|
|
|
|
|
) {
|
2022-03-14 10:41:56 -07:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
this.lastSaveTime = now.getTime();
|
|
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
const crawlDir = path.join(this.collDir, "crawls");
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
await fsp.mkdir(crawlDir, { recursive: true });
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2025-04-01 13:40:03 -07:00
|
|
|
const filenameOnly = `${interpolateFilename("@ts-@id", this.crawlId)}.yaml`;
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
|
|
|
|
const filename = path.join(crawlDir, filenameOnly);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
|
|
|
|
const state = await this.crawlState.serialize();
|
|
|
|
|
|
|
|
|
|
if (this.origConfig) {
|
|
|
|
|
this.origConfig.state = state;
|
|
|
|
|
}
|
2025-01-25 22:55:49 -08:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
try {
|
2025-01-25 22:55:49 -08:00
|
|
|
const res = yaml.dump(this.origConfig, { lineWidth: -1 });
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Saving crawl state to: ${filename}`);
|
2022-03-14 10:41:56 -07:00
|
|
|
await fsp.writeFile(filename, res);
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error(`Failed to write save state file: ${filename}`, e);
|
2022-03-14 10:41:56 -07:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2025-04-01 18:59:04 -07:00
|
|
|
if (!this.saveStateFiles.includes(filename)) {
|
|
|
|
|
this.saveStateFiles.push(filename);
|
|
|
|
|
}
|
2022-03-14 10:41:56 -07:00
|
|
|
|
|
|
|
|
if (this.saveStateFiles.length > this.params.saveStateHistory) {
|
|
|
|
|
const oldFilename = this.saveStateFiles.shift();
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Removing old save-state: ${oldFilename}`);
|
2022-03-14 10:41:56 -07:00
|
|
|
try {
|
2023-11-09 11:27:11 -08:00
|
|
|
await fsp.unlink(oldFilename || "");
|
2022-03-14 10:41:56 -07:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error(`Failed to delete old save state file: ${oldFilename}`);
|
2022-03-14 10:41:56 -07:00
|
|
|
}
|
|
|
|
|
}
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
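Roughly, the health check contract amounts to the following (a standalone sketch, not the crawler's HealthChecker from util/healthcheck.js; the port and counters here are illustrative):

import http from "http";

// illustrative values; the real ones come from --healthCheckPort and worker state
const healthCheckPort = 6065;
const numWorkers = 2;
let consecutiveFailedPageLoads = 0;

http
  .createServer((_req, res) => {
    // healthy while consecutive failed page loads stay below 2x the worker count
    const healthy = consecutiveFailedPageLoads < 2 * numWorkers;
    res.writeHead(healthy ? 200 : 503);
    res.end();
  })
  .listen(healthCheckPort);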
2022-05-18 22:51:55 -07:00
|
|
|
|
|
|
|
|
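// with remote storage configured, a final (done) save, and --saveState set to "always", also upload the state file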
if (this.storage && done && this.params.saveState === "always") {
|
2025-04-01 18:59:04 -07:00
|
|
|
await this.storage.uploadFile(filename, filenameOnly);
|
2022-05-18 22:51:55 -07:00
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
* state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
}
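The YAML written above is the original config with a serialized state section added, so it can be read back directly. A minimal sketch of doing that (the crawler's real restore path goes through its config parser and RedisCrawlState, so treat this as illustrative only):

import fsp from "fs/promises"; // already imported at the top of this file
import yaml from "js-yaml"; // likewise

async function loadSavedCrawlState(filename: string) {
  const text = await fsp.readFile(filename, { encoding: "utf-8" });
  const config = yaml.load(text) as { state?: Record<string, unknown> };
  return config?.state ?? null;
}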
|
2024-03-22 17:32:42 -07:00
|
|
|
|
2024-03-26 14:54:27 -07:00
|
|
|
getWarcPrefix(defaultValue = "") {
|
|
|
|
|
let warcPrefix =
|
|
|
|
|
process.env.WARC_PREFIX || this.params.warcPrefix || defaultValue;
|
|
|
|
|
|
|
|
|
|
if (warcPrefix) {
|
|
|
|
|
warcPrefix += "-" + this.crawlId + "-";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return warcPrefix;
|
|
|
|
|
}
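A quick illustration of the precedence above, assuming a Crawler instance named crawler whose crawlId is "my-crawl" and with no --warcPrefix set (all values here are made up):

process.env.WARC_PREFIX = "acme";
console.assert(crawler.getWarcPrefix("rec") === "acme-my-crawl-");

delete process.env.WARC_PREFIX;
console.assert(crawler.getWarcPrefix("rec") === "rec-my-crawl-");
console.assert(crawler.getWarcPrefix() === ""); // empty prefix when nothing is set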
|
|
|
|
|
|
|
|
|
|
createExtraResourceWarcWriter(resourceName: string, gzip = true) {
|
2024-04-15 13:43:08 -07:00
|
|
|
const filenameBase = `${this.getWarcPrefix()}${resourceName}-$ts`;
|
2024-03-26 14:54:27 -07:00
|
|
|
|
|
|
|
|
return this.createWarcWriter(filenameBase, gzip, { resourceName });
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
createWarcWriter(
|
|
|
|
|
filenameBase: string,
|
|
|
|
|
gzip: boolean,
|
|
|
|
|
logDetails: Record<string, string>,
|
|
|
|
|
) {
|
|
|
|
|
const filenameTemplate = `${filenameBase}.warc${gzip ? ".gz" : ""}`;
|
2025-04-03 02:10:50 +02:00
|
|
|
const useSHA1 = this.params.useSHA1;
|
2024-03-26 14:54:27 -07:00
|
|
|
|
|
|
|
|
return new WARCWriter({
|
|
|
|
|
archivesDir: this.archivesDir,
|
Streaming in-place WACZ creation + CDXJ indexing (#673)
Fixes #674
This PR supersedes #505, and instead of using js-wacz for optimized WACZ
creation:
- generates an 'in-place' or 'streaming' WACZ in the crawler, without
having to copy the data again.
- WACZ contents are streamed to remote upload (or to disk) from existing
files on disk
- CDXJ indices per-WARC are first written to 'warc-cdx' directory, then merged using the linux 'sort' command, and compressed to ZipNum if >50K (or always if using --generateCDX)
- All data in the WARCs is written and read only once
- Should result in significant speed / disk usage improvements:
previously WARC was written once, then read again (for CDXJ indexing),
read again (for adding to new WACZ ZIP), written to disk (into new WACZ
ZIP), read again (if upload to remote endpoint). Now, WARCs are written
once, along with the per-WARC CDXJ; only the CDXJ is reread, sorted, and merged on disk, and all
data is read once to either generate WACZ on disk or upload to remote.
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-08-29 13:21:20 -07:00
|
|
|
warcCdxDir: this.warcCdxDir,
|
2024-03-26 14:54:27 -07:00
|
|
|
filenameTemplate,
|
|
|
|
|
rolloverSize: this.params.rolloverSize,
|
|
|
|
|
gzip,
|
2025-04-03 02:10:50 +02:00
|
|
|
useSHA1,
|
2024-03-26 14:54:27 -07:00
|
|
|
logDetails,
|
|
|
|
|
});
|
|
|
|
|
}
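Putting the two helpers above together, a side-channel writer might look like this (assuming a Crawler instance named crawler; the resource names are illustrative):

// a gzipped WARC named "<prefix>screenshots-$ts.warc.gz", rolling over at params.rolloverSize
const screenshotWriter = crawler.createExtraResourceWarcWriter("screenshots");

// or with an explicit filename base and no gzip, giving "<prefix>custom-$ts.warc"
const customWriter = crawler.createWarcWriter(
  `${crawler.getWarcPrefix()}custom-$ts`,
  false,
  { resourceName: "custom" },
);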
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
createRecorder(id: number): Recorder | null {
|
|
|
|
|
if (!this.recording) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-26 14:54:27 -07:00
|
|
|
const filenameBase = `${this.getWarcPrefix("rec")}$ts-${id}`;
|
|
|
|
|
|
|
|
|
|
const writer = this.createWarcWriter(filenameBase, true, {
|
|
|
|
|
id: id.toString(),
|
|
|
|
|
});
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
const res = new Recorder({
|
|
|
|
|
workerid: id,
|
|
|
|
|
crawler: this,
|
2024-03-26 14:54:27 -07:00
|
|
|
writer,
|
2024-03-22 17:32:42 -07:00
|
|
|
});
|
|
|
|
|
|
|
|
|
|
this.browser.recorders.push(res);
|
|
|
|
|
return res;
|
|
|
|
|
}
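Worker setup then reduces to something like the following (assuming a Crawler instance named crawler and a numeric workerId):

const recorder = crawler.createRecorder(workerId); // null when recording is disabled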
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
function getDownloadResponse(req: HTTPRequest) {
|
2022-03-14 14:41:39 -07:00
|
|
|
try {
|
2024-06-26 09:16:24 -07:00
|
|
|
if (!req.isNavigationRequest()) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const failure = req.failure();
|
2023-11-09 19:11:11 -05:00
|
|
|
const failureText = (failure && failure.errorText) || "";
|
|
|
|
|
if (
|
|
|
|
|
failureText !== "net::ERR_ABORTED" ||
|
|
|
|
|
req.resourceType() !== "document"
|
|
|
|
|
) {
|
2024-06-26 09:16:24 -07:00
|
|
|
return null;
|
2022-03-14 14:41:39 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const resp = req.response();
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
if (!resp) {
|
|
|
|
|
return null;
|
2022-03-14 14:41:39 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
const headers = resp.headers();
|
|
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
if (
|
|
|
|
|
headers["content-disposition"] ||
|
2024-06-26 09:16:24 -07:00
|
|
|
(headers["content-type"] && !isHTMLMime(headers["content-type"]))
|
2023-11-09 19:11:11 -05:00
|
|
|
) {
|
2024-06-26 09:16:24 -07:00
|
|
|
return resp;
|
2022-03-14 14:41:39 -07:00
|
|
|
}
|
|
|
|
|
} catch (e) {
|
2024-06-26 09:16:24 -07:00
|
|
|
// ignore
|
2022-03-14 14:41:39 -07:00
|
|
|
}
|
2023-11-09 11:27:11 -08:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
return null;
|
2022-03-14 14:41:39 -07:00
|
|
|
}
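getDownloadResponse() relies on isHTMLMime() to treat an aborted document navigation with a non-HTML content-type (or a content-disposition header) as a download. A minimal sketch of that kind of MIME check, which may be looser or stricter than the crawler's real helper:

function isHTMLMimeSketch(contentType: string): boolean {
  // drop parameters such as "; charset=utf-8" before comparing
  const mime = contentType.split(";")[0].trim().toLowerCase();
  return mime === "text/html" || mime === "application/xhtml+xml";
}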
|