2023-11-09 11:27:11 -08:00
|
|
|
import child_process, { ChildProcess, StdioOptions } from "child_process";
|
2022-10-24 15:30:10 +02:00
|
|
|
import path from "path";
|
2023-11-09 11:27:11 -08:00
|
|
|
import fs, { WriteStream } from "fs";
|
2022-10-24 15:30:10 +02:00
|
|
|
import os from "os";
|
2024-04-11 13:55:52 -07:00
|
|
|
import fsp from "fs/promises";
|
2023-11-09 11:27:11 -08:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
import {
|
|
|
|
|
RedisCrawlState,
|
|
|
|
|
LoadState,
|
|
|
|
|
QueueState,
|
|
|
|
|
PageState,
|
|
|
|
|
WorkerId,
|
|
|
|
|
} from "./util/state.js";
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
import { parseArgs } from "./util/argParser.js";
|
|
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
import yaml from "js-yaml";
|
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
import { HealthChecker } from "./util/healthcheck.js";
|
2023-10-31 23:05:30 -07:00
|
|
|
import { TextExtractViaSnapshot } from "./util/textextract.js";
|
2023-11-09 19:11:11 -05:00
|
|
|
import {
|
|
|
|
|
initStorage,
|
|
|
|
|
getFileSize,
|
|
|
|
|
getDirSize,
|
|
|
|
|
interpolateFilename,
|
|
|
|
|
checkDiskUtilization,
|
|
|
|
|
S3StorageSync,
|
|
|
|
|
} from "./util/storage.js";
|
2023-11-09 11:27:11 -08:00
|
|
|
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
|
2022-12-21 12:06:13 -05:00
|
|
|
import { Screenshots } from "./util/screenshots.js";
|
2022-10-24 15:30:10 +02:00
|
|
|
import { initRedis } from "./util/redis.js";
|
2024-04-12 14:31:07 -07:00
|
|
|
import { logger, formatErr, LogDetails } from "./util/logger.js";
|
2024-03-21 13:56:05 -07:00
|
|
|
import {
|
|
|
|
|
WorkerOpts,
|
|
|
|
|
WorkerState,
|
|
|
|
|
closeWorkers,
|
|
|
|
|
runWorkers,
|
|
|
|
|
} from "./util/worker.js";
|
2023-03-22 14:50:18 -04:00
|
|
|
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
|
2024-03-18 14:24:48 -07:00
|
|
|
import { collectAllFileSources, getInfoString } from "./util/file_reader.js";
|
2020-11-01 19:22:53 -08:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
import { Browser } from "./util/browser.js";
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
import {
|
|
|
|
|
ADD_LINK_FUNC,
|
|
|
|
|
BEHAVIOR_LOG_FUNC,
|
|
|
|
|
DEFAULT_SELECTORS,
|
2024-06-25 13:53:43 -07:00
|
|
|
DISPLAY,
|
2023-11-09 19:11:11 -05:00
|
|
|
} from "./util/constants.js";
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
2022-10-25 10:53:32 -04:00
|
|
|
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
|
2023-04-19 19:17:15 -07:00
|
|
|
import { OriginOverride } from "./util/originoverride.js";
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
import {
|
|
|
|
|
CDPSession,
|
|
|
|
|
Frame,
|
|
|
|
|
HTTPRequest,
|
|
|
|
|
HTTPResponse,
|
|
|
|
|
Page,
|
|
|
|
|
Protocol,
|
|
|
|
|
} from "puppeteer-core";
|
2024-03-22 17:32:42 -07:00
|
|
|
import { Recorder } from "./util/recorder.js";
|
SAX-based sitemap parser (#497)
Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser
Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a
time (currently 5)
- `fromDate` and `toDate` filter dates, to only include URLs between the given
dates, filtering nested sitemap lists included
- async parsing, continue parsing in the background after 100 URLs
- timeout for initial fetch / first 100 URLs set to 30 seconds to avoid
slowing down the crawl
- save/load state integration: mark if sitemaps have already been parsed
in redis, serialize to save state, to avoid reparsing again. (Will
reparse if parsing did not fully finish)
- Aware of `pageLimit`, don't add URLs pass the page limit, interrupt
further parsing when at limit.
- robots.txt `sitemap:` parsing, check URL extension and mime type
- automatic detection of sitemaps for a seed URL if no sitemap url provided - first check robots.txt,
then /sitemap.xml
- tests: test for full sitemap autodetect, sitemap with limit, and sitemap from specific URL.
Fixes #496
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-03-18 19:14:07 -07:00
|
|
|
import { SitemapReader } from "./util/sitemapper.js";
|
|
|
|
|
import { ScopedSeed } from "./util/seeds.js";
|
2024-05-22 15:47:05 -07:00
|
|
|
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
|
2024-06-26 09:16:24 -07:00
|
|
|
import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
|
2024-06-10 13:11:00 -07:00
|
|
|
import { initProxy } from "./util/proxy.js";
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
const behaviors = fs.readFileSync(
|
|
|
|
|
new URL(
|
|
|
|
|
"../node_modules/browsertrix-behaviors/dist/behaviors.js",
|
|
|
|
|
import.meta.url,
|
|
|
|
|
),
|
|
|
|
|
{ encoding: "utf8" },
|
|
|
|
|
);
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2023-03-10 23:11:24 -05:00
|
|
|
const FETCH_TIMEOUT_SECS = 30;
|
|
|
|
|
const PAGE_OP_TIMEOUT_SECS = 5;
|
SAX-based sitemap parser (#497)
Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser
Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a
time (currently 5)
- `fromDate` and `toDate` filter dates, to only include URLs between the given
dates, filtering nested sitemap lists included
- async parsing, continue parsing in the background after 100 URLs
- timeout for initial fetch / first 100 URLs set to 30 seconds to avoid
slowing down the crawl
- save/load state integration: mark if sitemaps have already been parsed
in redis, serialize to save state, to avoid reparsing again. (Will
reparse if parsing did not fully finish)
- Aware of `pageLimit`, don't add URLs pass the page limit, interrupt
further parsing when at limit.
- robots.txt `sitemap:` parsing, check URL extension and mime type
- automatic detection of sitemaps for a seed URL if no sitemap url provided - first check robots.txt,
then /sitemap.xml
- tests: test for full sitemap autodetect, sitemap with limit, and sitemap from specific URL.
Fixes #496
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-03-18 19:14:07 -07:00
|
|
|
const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2024-03-21 08:16:59 -07:00
|
|
|
const RUN_DETACHED = process.env.DETACHED_CHILD_PROC == "1";
|
|
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
const POST_CRAWL_STATES = [
|
|
|
|
|
"generate-wacz",
|
|
|
|
|
"uploading-wacz",
|
|
|
|
|
"generate-cdx",
|
|
|
|
|
"generate-warc",
|
|
|
|
|
];
|
2023-05-03 16:25:59 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
type PageEntry = {
|
|
|
|
|
id: string;
|
|
|
|
|
url: string;
|
|
|
|
|
title?: string;
|
|
|
|
|
loadState?: number;
|
|
|
|
|
mime?: string;
|
|
|
|
|
seed?: boolean;
|
|
|
|
|
text?: string;
|
|
|
|
|
favIconUrl?: string;
|
2024-02-09 19:44:17 -05:00
|
|
|
ts?: string;
|
2024-02-28 22:56:12 -08:00
|
|
|
status?: number;
|
2023-11-09 11:27:11 -08:00
|
|
|
};
|
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
// ============================================================================
|
2022-10-24 15:30:10 +02:00
|
|
|
export class Crawler {
|
2023-11-09 11:27:11 -08:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
params: any;
|
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
origConfig: any;
|
|
|
|
|
|
|
|
|
|
collDir: string;
|
|
|
|
|
logDir: string;
|
|
|
|
|
logFilename: string;
|
|
|
|
|
|
|
|
|
|
headers: Record<string, string> = {};
|
|
|
|
|
|
|
|
|
|
crawlState!: RedisCrawlState;
|
|
|
|
|
|
2024-04-11 13:55:52 -07:00
|
|
|
pagesFH?: WriteStream | null = null;
|
|
|
|
|
extraPagesFH?: WriteStream | null = null;
|
2023-11-09 11:27:11 -08:00
|
|
|
logFH!: WriteStream;
|
|
|
|
|
|
|
|
|
|
crawlId: string;
|
|
|
|
|
|
|
|
|
|
startTime: number;
|
|
|
|
|
|
|
|
|
|
limitHit = false;
|
|
|
|
|
pageLimit: number;
|
|
|
|
|
|
|
|
|
|
saveStateFiles: string[] = [];
|
|
|
|
|
lastSaveTime: number;
|
|
|
|
|
|
|
|
|
|
maxPageTime: number;
|
|
|
|
|
|
2024-06-13 17:18:06 -07:00
|
|
|
seeds: ScopedSeed[];
|
|
|
|
|
numOriginalSeeds = 0;
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
emulateDevice: any = {};
|
|
|
|
|
|
|
|
|
|
captureBasePrefix = "";
|
|
|
|
|
|
|
|
|
|
infoString!: string;
|
|
|
|
|
|
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
gotoOpts: Record<string, any>;
|
|
|
|
|
|
|
|
|
|
pagesDir: string;
|
2024-04-11 13:55:52 -07:00
|
|
|
seedPagesFile: string;
|
|
|
|
|
otherPagesFile: string;
|
2023-11-09 11:27:11 -08:00
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
archivesDir: string;
|
2024-03-26 14:54:27 -07:00
|
|
|
tempdir: string;
|
|
|
|
|
tempCdxDir: string;
|
|
|
|
|
|
|
|
|
|
screenshotWriter: WARCWriter | null;
|
|
|
|
|
textWriter: WARCWriter | null;
|
2024-03-22 17:32:42 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
blockRules: BlockRules | null;
|
|
|
|
|
adBlockRules: AdBlockRules | null;
|
|
|
|
|
|
|
|
|
|
healthChecker: HealthChecker | null = null;
|
|
|
|
|
originOverride: OriginOverride | null = null;
|
|
|
|
|
|
|
|
|
|
screencaster: ScreenCaster | null = null;
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
skipTextDocs = 0;
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
interrupted = false;
|
|
|
|
|
finalExit = false;
|
|
|
|
|
uploadAndDeleteLocal = false;
|
|
|
|
|
done = false;
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
textInPages = false;
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
customBehaviors = "";
|
2023-12-13 12:14:53 -08:00
|
|
|
behaviorsChecked = false;
|
2023-11-09 11:27:11 -08:00
|
|
|
behaviorLastLine?: string;
|
|
|
|
|
|
|
|
|
|
browser: Browser;
|
|
|
|
|
storage: S3StorageSync | null = null;
|
|
|
|
|
|
|
|
|
|
maxHeapUsed = 0;
|
|
|
|
|
maxHeapTotal = 0;
|
|
|
|
|
|
2024-06-10 13:11:00 -07:00
|
|
|
proxyServer?: string;
|
|
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
driver!: (opts: {
|
|
|
|
|
page: Page;
|
|
|
|
|
data: PageState;
|
|
|
|
|
// eslint-disable-next-line no-use-before-define
|
|
|
|
|
crawler: Crawler;
|
|
|
|
|
}) => NonNullable<unknown>;
|
2023-11-09 11:27:11 -08:00
|
|
|
|
2024-06-07 10:34:19 -07:00
|
|
|
recording: boolean;
|
2024-03-22 17:32:42 -07:00
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
constructor() {
|
2024-03-22 17:32:42 -07:00
|
|
|
const args = this.parseArgs();
|
|
|
|
|
this.params = args.parsed;
|
|
|
|
|
this.origConfig = args.origConfig;
|
2022-12-15 12:38:41 -05:00
|
|
|
|
2023-02-24 18:31:08 -08:00
|
|
|
// root collections dir
|
2023-11-09 11:27:11 -08:00
|
|
|
this.collDir = path.join(
|
|
|
|
|
this.params.cwd,
|
|
|
|
|
"collections",
|
2023-11-09 19:11:11 -05:00
|
|
|
this.params.collection,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-02-24 18:31:08 -08:00
|
|
|
this.logDir = path.join(this.collDir, "logs");
|
2023-11-09 11:27:11 -08:00
|
|
|
this.logFilename = path.join(
|
|
|
|
|
this.logDir,
|
2023-11-09 19:11:11 -05:00
|
|
|
`crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-02-24 18:31:08 -08:00
|
|
|
|
2022-12-15 12:38:41 -05:00
|
|
|
const debugLogging = this.params.logging.includes("debug");
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.setDebugLogging(debugLogging);
|
2023-04-01 13:07:59 -04:00
|
|
|
logger.setLogLevel(this.params.logLevel);
|
2024-03-07 08:35:53 -08:00
|
|
|
logger.setContext(this.params.logContext);
|
|
|
|
|
logger.setExcludeContext(this.params.logExcludeContext);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-10-09 12:28:58 -07:00
|
|
|
// if automatically restarts on error exit code,
|
|
|
|
|
// exit with 0 from fatal by default, to avoid unnecessary restart
|
|
|
|
|
// otherwise, exit with default fatal exit code
|
|
|
|
|
if (this.params.restartsOnError) {
|
|
|
|
|
logger.setDefaultFatalExitCode(0);
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-14 21:54:40 -08:00
|
|
|
logger.debug("Writing log to: " + this.logFilename, {}, "general");
|
2022-12-15 12:38:41 -05:00
|
|
|
|
2024-06-07 10:34:19 -07:00
|
|
|
this.recording = !this.params.dryRun;
|
|
|
|
|
if (this.params.dryRun) {
|
|
|
|
|
logger.warn(
|
|
|
|
|
"Dry run mode: no archived data stored, only pages and logging. Storage and archive creation related options will be ignored.",
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-03 17:16:29 +00:00
|
|
|
this.headers = {};
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
// pages file
|
|
|
|
|
this.pagesFH = null;
|
|
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
this.crawlId = process.env.CRAWL_ID || os.hostname();
|
|
|
|
|
|
|
|
|
|
this.startTime = Date.now();
|
|
|
|
|
|
2021-01-29 18:26:55 +00:00
|
|
|
// was the limit hit?
|
|
|
|
|
this.limitHit = false;
|
2023-04-03 11:10:47 -07:00
|
|
|
this.pageLimit = this.params.pageLimit;
|
|
|
|
|
|
|
|
|
|
// resolve maxPageLimit and ensure pageLimit is no greater than maxPageLimit
|
|
|
|
|
if (this.params.maxPageLimit) {
|
2023-11-09 11:27:11 -08:00
|
|
|
this.pageLimit = this.pageLimit
|
|
|
|
|
? Math.min(this.pageLimit, this.params.maxPageLimit)
|
|
|
|
|
: this.params.maxPageLimit;
|
2023-04-03 11:10:47 -07:00
|
|
|
}
|
2021-01-29 18:26:55 +00:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
this.saveStateFiles = [];
|
|
|
|
|
this.lastSaveTime = 0;
|
2023-03-22 14:50:18 -04:00
|
|
|
|
2024-06-13 17:18:06 -07:00
|
|
|
this.seeds = this.params.scopedSeeds as ScopedSeed[];
|
|
|
|
|
this.numOriginalSeeds = this.seeds.length;
|
|
|
|
|
|
2023-03-22 14:50:18 -04:00
|
|
|
// sum of page load + behavior timeouts + 2 x fetch + cloudflare + link extraction timeouts + extra page delay
|
|
|
|
|
// if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
|
2023-11-09 11:27:11 -08:00
|
|
|
this.maxPageTime =
|
|
|
|
|
this.params.pageLoadTimeout +
|
|
|
|
|
this.params.behaviorTimeout +
|
|
|
|
|
FETCH_TIMEOUT_SECS * 2 +
|
|
|
|
|
PAGE_OP_TIMEOUT_SECS * 2 +
|
|
|
|
|
this.params.pageExtraDelay;
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
this.emulateDevice = this.params.emulateDevice || {};
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2023-11-07 21:38:50 -08:00
|
|
|
//this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
|
|
|
|
|
//this.capturePrefix = "";//process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/";
|
2023-11-09 11:27:11 -08:00
|
|
|
//this.captureBasePrefix = "";
|
2021-02-04 00:28:32 -05:00
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
this.gotoOpts = {
|
|
|
|
|
waitUntil: this.params.waitUntil,
|
2023-11-09 11:27:11 -08:00
|
|
|
timeout: this.params.pageLoadTimeout * 1000,
|
2021-05-21 15:37:02 -07:00
|
|
|
};
|
2021-02-04 00:28:32 -05:00
|
|
|
|
|
|
|
|
// pages directory
|
|
|
|
|
this.pagesDir = path.join(this.collDir, "pages");
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2021-02-04 00:28:32 -05:00
|
|
|
// pages file
|
2024-04-11 13:55:52 -07:00
|
|
|
this.seedPagesFile = path.join(this.pagesDir, "pages.jsonl");
|
|
|
|
|
this.otherPagesFile = path.join(this.pagesDir, "extraPages.jsonl");
|
2021-07-19 15:49:43 -07:00
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
// archives dir
|
|
|
|
|
this.archivesDir = path.join(this.collDir, "archive");
|
2024-03-26 14:54:27 -07:00
|
|
|
this.tempdir = path.join(os.tmpdir(), "tmp-dl");
|
|
|
|
|
this.tempCdxDir = path.join(this.collDir, "tmp-cdx");
|
|
|
|
|
|
|
|
|
|
this.screenshotWriter = null;
|
|
|
|
|
this.textWriter = null;
|
2024-03-22 17:32:42 -07:00
|
|
|
|
2021-07-19 15:49:43 -07:00
|
|
|
this.blockRules = null;
|
2022-10-25 10:53:32 -04:00
|
|
|
this.adBlockRules = null;
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
this.healthChecker = null;
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
2022-09-20 17:09:52 -07:00
|
|
|
this.interrupted = false;
|
|
|
|
|
this.finalExit = false;
|
2023-08-01 00:04:10 -07:00
|
|
|
this.uploadAndDeleteLocal = false;
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
this.textInPages = this.params.text.includes("to-pages");
|
|
|
|
|
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
this.done = false;
|
2022-09-20 17:09:52 -07:00
|
|
|
|
2023-07-06 16:09:48 -04:00
|
|
|
this.customBehaviors = "";
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
|
|
|
|
|
this.browser = new Browser();
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
}
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
protected parseArgs() {
|
|
|
|
|
return parseArgs();
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-14 19:32:31 +00:00
|
|
|
configureUA() {
|
|
|
|
|
// override userAgent
|
|
|
|
|
if (this.params.userAgent) {
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
this.emulateDevice.userAgent = this.params.userAgent;
|
2023-10-25 18:32:10 +02:00
|
|
|
return this.params.userAgent;
|
2020-11-14 19:32:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// if device set, it overrides the default Chrome UA
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (!this.emulateDevice.userAgent) {
|
|
|
|
|
this.emulateDevice.userAgent = this.browser.getDefaultUA();
|
2020-11-14 19:32:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// suffix to append to default userAgent
|
|
|
|
|
if (this.params.userAgentSuffix) {
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
this.emulateDevice.userAgent += " " + this.params.userAgentSuffix;
|
2020-11-14 19:32:31 +00:00
|
|
|
}
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
|
|
|
|
|
return this.emulateDevice.userAgent;
|
2020-11-14 19:32:31 +00:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
async initCrawlState() {
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (!redisUrl.startsWith("redis://")) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.fatal(
|
2023-11-09 19:11:11 -05:00
|
|
|
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
let redis;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
while (true) {
|
|
|
|
|
try {
|
|
|
|
|
redis = await initRedis(redisUrl);
|
|
|
|
|
break;
|
|
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
//logger.fatal("Unable to connect to state store Redis: " + redisUrl);
|
|
|
|
|
logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
|
2023-09-06 11:14:18 -04:00
|
|
|
await sleep(1);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
}
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`,
|
|
|
|
|
{},
|
2023-11-09 19:11:11 -05:00
|
|
|
"state",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-03-22 14:50:18 -04:00
|
|
|
logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
this.crawlState = new RedisCrawlState(
|
|
|
|
|
redis,
|
|
|
|
|
this.params.crawlId,
|
|
|
|
|
this.maxPageTime,
|
2023-11-09 19:11:11 -05:00
|
|
|
os.hostname(),
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2024-03-15 20:54:43 -04:00
|
|
|
// load full state from config
|
|
|
|
|
if (this.params.state) {
|
2024-06-13 17:18:06 -07:00
|
|
|
await this.crawlState.load(this.params.state, this.seeds, true);
|
2024-03-15 20:54:43 -04:00
|
|
|
// otherwise, just load extra seeds
|
|
|
|
|
} else {
|
|
|
|
|
await this.loadExtraSeeds();
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-30 21:29:41 -07:00
|
|
|
// clear any pending URLs from this instance
|
|
|
|
|
await this.crawlState.clearOwnPendingLocks();
|
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
if (this.params.saveState === "always" && this.params.saveStateInterval) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`,
|
|
|
|
|
{},
|
2023-11-09 19:11:11 -05:00
|
|
|
"state",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2022-03-14 10:41:56 -07:00
|
|
|
}
|
|
|
|
|
|
2023-04-11 11:32:52 -04:00
|
|
|
if (this.params.logErrorsToRedis) {
|
|
|
|
|
logger.setLogErrorsToRedis(true);
|
|
|
|
|
logger.setCrawlState(this.crawlState);
|
|
|
|
|
}
|
|
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
return this.crawlState;
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-15 20:54:43 -04:00
|
|
|
async loadExtraSeeds() {
|
|
|
|
|
const extraSeeds = await this.crawlState.getExtraSeeds();
|
|
|
|
|
|
|
|
|
|
for (const { origSeedId, newUrl } of extraSeeds) {
|
2024-06-13 17:18:06 -07:00
|
|
|
const seed = this.seeds[origSeedId];
|
|
|
|
|
this.seeds.push(seed.newScopedSeed(newUrl));
|
2024-03-15 20:54:43 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-02-23 12:09:48 -08:00
|
|
|
initScreenCaster() {
|
|
|
|
|
let transport;
|
|
|
|
|
|
|
|
|
|
if (this.params.screencastPort) {
|
|
|
|
|
transport = new WSTransport(this.params.screencastPort);
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
`Screencast server started on: ${this.params.screencastPort}`,
|
|
|
|
|
{},
|
2023-11-09 19:11:11 -05:00
|
|
|
"screencast",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2022-02-23 12:09:48 -08:00
|
|
|
}
|
2023-11-09 11:27:11 -08:00
|
|
|
// } else if (this.params.redisStoreUrl && this.params.screencastRedis) {
|
|
|
|
|
// transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId);
|
|
|
|
|
// logger.debug("Screencast enabled via redis pubsub", {}, "screencast");
|
|
|
|
|
// }
|
2022-02-23 12:09:48 -08:00
|
|
|
|
|
|
|
|
if (!transport) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2022-03-02 13:26:11 -08:00
|
|
|
return new ScreenCaster(transport, this.params.workers);
|
2022-02-23 12:09:48 -08:00
|
|
|
}
|
|
|
|
|
|
2023-11-07 21:38:50 -08:00
|
|
|
launchRedis() {
|
2023-11-09 11:27:11 -08:00
|
|
|
let redisStdio: StdioOptions;
|
2023-11-07 21:38:50 -08:00
|
|
|
|
|
|
|
|
if (this.params.logging.includes("redis")) {
|
|
|
|
|
const redisStderr = fs.openSync(path.join(this.logDir, "redis.log"), "a");
|
|
|
|
|
redisStdio = [process.stdin, redisStderr, redisStderr];
|
|
|
|
|
} else {
|
|
|
|
|
redisStdio = "ignore";
|
|
|
|
|
}
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
let redisArgs: string[] = [];
|
2023-11-07 21:38:50 -08:00
|
|
|
if (this.params.debugAccessRedis) {
|
|
|
|
|
redisArgs = ["--protected-mode", "no"];
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
return child_process.spawn("redis-server", redisArgs, {
|
|
|
|
|
cwd: "/tmp/",
|
|
|
|
|
stdio: redisStdio,
|
2024-03-21 08:16:59 -07:00
|
|
|
detached: RUN_DETACHED,
|
2023-11-09 11:27:11 -08:00
|
|
|
});
|
2023-11-07 21:38:50 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async bootstrap() {
|
2023-11-09 11:27:11 -08:00
|
|
|
const subprocesses: ChildProcess[] = [];
|
2023-11-07 21:38:50 -08:00
|
|
|
|
2024-06-10 13:11:00 -07:00
|
|
|
this.proxyServer = initProxy(this.params.proxyServer);
|
|
|
|
|
|
2024-07-23 18:50:26 -07:00
|
|
|
const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
|
|
|
|
|
|
|
|
|
|
if (
|
|
|
|
|
redisUrl.startsWith("redis://localhost:") ||
|
|
|
|
|
redisUrl.startsWith("redis://127.0.0.1:")
|
|
|
|
|
) {
|
|
|
|
|
subprocesses.push(this.launchRedis());
|
|
|
|
|
}
|
2023-11-07 21:38:50 -08:00
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
await fsp.mkdir(this.logDir, { recursive: true });
|
2024-06-07 10:34:19 -07:00
|
|
|
|
|
|
|
|
if (!this.params.dryRun) {
|
|
|
|
|
await fsp.mkdir(this.archivesDir, { recursive: true });
|
|
|
|
|
await fsp.mkdir(this.tempdir, { recursive: true });
|
|
|
|
|
await fsp.mkdir(this.tempCdxDir, { recursive: true });
|
|
|
|
|
}
|
2024-03-22 17:32:42 -07:00
|
|
|
|
2024-04-15 13:43:08 -07:00
|
|
|
this.logFH = fs.createWriteStream(this.logFilename, { flags: "a" });
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.setExternalLogStream(this.logFH);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2024-03-18 14:24:48 -07:00
|
|
|
this.infoString = await getInfoString();
|
2024-05-22 15:47:05 -07:00
|
|
|
setWARCInfo(this.infoString, this.params.warcInfo);
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(this.infoString);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2024-06-13 17:18:06 -07:00
|
|
|
logger.info("Seeds", this.seeds);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2024-08-27 16:20:19 -04:00
|
|
|
if (this.params.behaviorOpts) {
|
|
|
|
|
logger.info("Behavior Options", this.params.behaviorOpts);
|
|
|
|
|
} else {
|
|
|
|
|
logger.info("Behaviors disabled");
|
|
|
|
|
}
|
2024-07-23 18:50:26 -07:00
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
if (this.params.profile) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info("With Browser Profile", { url: this.params.profile });
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
|
|
|
|
|
2023-02-03 00:02:47 -05:00
|
|
|
if (this.params.overwrite) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Clearing ${this.collDir} before starting`);
|
2023-02-03 00:02:47 -05:00
|
|
|
try {
|
|
|
|
|
fs.rmSync(this.collDir, { recursive: true, force: true });
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error(`Unable to clear ${this.collDir}`, e);
|
2023-02-03 00:02:47 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-07-06 16:09:48 -04:00
|
|
|
if (this.params.customBehaviors) {
|
2023-11-09 11:27:11 -08:00
|
|
|
this.customBehaviors = this.loadCustomBehaviors(
|
2023-11-09 19:11:11 -05:00
|
|
|
this.params.customBehaviors,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-07-06 16:09:48 -04:00
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
this.headers = { "User-Agent": this.configureUA() };
|
2020-11-03 17:16:29 +00:00
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
process.on("exit", () => {
|
|
|
|
|
for (const proc of subprocesses) {
|
|
|
|
|
proc.kill();
|
|
|
|
|
}
|
|
|
|
|
});
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2024-06-20 20:10:25 -07:00
|
|
|
if (this.params.debugAccessBrowser) {
|
|
|
|
|
child_process.spawn(
|
|
|
|
|
"socat",
|
|
|
|
|
["tcp-listen:9222,reuseaddr,fork", "tcp:localhost:9221"],
|
|
|
|
|
{ detached: RUN_DETACHED },
|
|
|
|
|
);
|
|
|
|
|
}
|
2022-08-21 00:30:25 -07:00
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (!this.params.headless && !process.env.NO_XVFB) {
|
2024-03-21 08:16:59 -07:00
|
|
|
child_process.spawn(
|
|
|
|
|
"Xvfb",
|
|
|
|
|
[
|
2024-06-25 13:53:43 -07:00
|
|
|
DISPLAY,
|
2024-03-21 08:16:59 -07:00
|
|
|
"-listen",
|
|
|
|
|
"tcp",
|
|
|
|
|
"-screen",
|
|
|
|
|
"0",
|
|
|
|
|
process.env.GEOMETRY || "",
|
|
|
|
|
"-ac",
|
|
|
|
|
"+extension",
|
|
|
|
|
"RANDR",
|
|
|
|
|
],
|
|
|
|
|
{ detached: RUN_DETACHED },
|
|
|
|
|
);
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2024-03-26 14:54:27 -07:00
|
|
|
|
2024-06-07 10:34:19 -07:00
|
|
|
if (this.params.screenshot && !this.params.dryRun) {
|
2024-03-26 14:54:27 -07:00
|
|
|
this.screenshotWriter = this.createExtraResourceWarcWriter("screenshots");
|
|
|
|
|
}
|
2024-06-07 10:34:19 -07:00
|
|
|
if (this.params.text && !this.params.dryRun) {
|
2024-03-26 14:54:27 -07:00
|
|
|
this.textWriter = this.createExtraResourceWarcWriter("text");
|
|
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
|
|
|
|
|
2022-11-21 11:59:37 -08:00
|
|
|
extraChromeArgs() {
|
|
|
|
|
const args = [];
|
|
|
|
|
if (this.params.lang) {
|
|
|
|
|
args.push(`--accept-lang=${this.params.lang}`);
|
|
|
|
|
}
|
|
|
|
|
return args;
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async run() {
|
2022-08-11 18:44:39 -07:00
|
|
|
await this.bootstrap();
|
2021-08-17 20:54:18 -07:00
|
|
|
|
2023-09-06 11:14:18 -04:00
|
|
|
let status = "done";
|
2022-09-20 17:09:52 -07:00
|
|
|
let exitCode = 0;
|
2020-10-31 13:16:37 -07:00
|
|
|
|
|
|
|
|
try {
|
2020-11-01 19:22:53 -08:00
|
|
|
await this.crawl();
|
2023-09-06 11:14:18 -04:00
|
|
|
const finished = await this.crawlState.isFinished();
|
2023-09-13 10:48:21 -07:00
|
|
|
const stopped = await this.crawlState.isCrawlStopped();
|
2023-10-09 12:28:58 -07:00
|
|
|
const canceled = await this.crawlState.isCrawlCanceled();
|
2023-09-13 22:54:55 -07:00
|
|
|
if (!finished) {
|
2023-10-09 12:28:58 -07:00
|
|
|
if (canceled) {
|
|
|
|
|
status = "canceled";
|
|
|
|
|
} else if (stopped) {
|
2023-09-13 22:54:55 -07:00
|
|
|
status = "done";
|
|
|
|
|
logger.info("Crawl gracefully stopped on request");
|
|
|
|
|
} else if (this.interrupted) {
|
|
|
|
|
status = "interrupted";
|
|
|
|
|
exitCode = 11;
|
|
|
|
|
}
|
2023-09-06 11:14:18 -04:00
|
|
|
}
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error("Crawl failed", e);
|
2022-09-20 17:09:52 -07:00
|
|
|
exitCode = 9;
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
status = "failing";
|
|
|
|
|
if (await this.crawlState.incFailCount()) {
|
|
|
|
|
status = "failed";
|
|
|
|
|
}
|
|
|
|
|
} finally {
|
2023-10-09 12:28:58 -07:00
|
|
|
await this.setStatusAndExit(exitCode, status);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2021-04-10 13:08:22 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
_behaviorLog(
|
|
|
|
|
{ data, type }: { data: string; type: string },
|
|
|
|
|
pageUrl: string,
|
2023-11-09 19:11:11 -05:00
|
|
|
workerid: WorkerId,
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
2022-07-08 17:17:46 -07:00
|
|
|
let behaviorLine;
|
2023-02-23 18:50:22 -08:00
|
|
|
let message;
|
|
|
|
|
let details;
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const logDetails = { page: pageUrl, workerid };
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
if (typeof data === "string") {
|
2023-02-23 18:50:22 -08:00
|
|
|
message = data;
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
details = logDetails;
|
2023-02-23 18:50:22 -08:00
|
|
|
} else {
|
|
|
|
|
message = type === "info" ? "Behavior log" : "Behavior debug";
|
2023-11-09 11:27:11 -08:00
|
|
|
details =
|
|
|
|
|
typeof data === "object"
|
|
|
|
|
? { ...(data as object), ...logDetails }
|
|
|
|
|
: logDetails;
|
2023-02-23 18:50:22 -08:00
|
|
|
}
|
2022-07-08 17:17:46 -07:00
|
|
|
|
2021-04-10 13:08:22 -07:00
|
|
|
switch (type) {
|
2023-11-09 19:11:11 -05:00
|
|
|
case "info":
|
|
|
|
|
behaviorLine = JSON.stringify(data);
|
|
|
|
|
if (behaviorLine !== this.behaviorLastLine) {
|
|
|
|
|
logger.info(message, details, "behaviorScript");
|
|
|
|
|
this.behaviorLastLine = behaviorLine;
|
|
|
|
|
}
|
|
|
|
|
break;
|
2021-04-10 13:08:22 -07:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
case "error":
|
|
|
|
|
logger.error(message, details, "behaviorScript");
|
|
|
|
|
break;
|
2023-07-06 16:09:48 -04:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
case "debug":
|
|
|
|
|
default:
|
|
|
|
|
logger.debug(message, details, "behaviorScript");
|
2021-04-10 13:08:22 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 16:11:48 -07:00
|
|
|
protected getScope(
|
2023-11-09 11:27:11 -08:00
|
|
|
{
|
|
|
|
|
seedId,
|
|
|
|
|
url,
|
|
|
|
|
depth,
|
|
|
|
|
extraHops,
|
2024-07-11 19:48:43 -07:00
|
|
|
noOOS,
|
|
|
|
|
}: {
|
|
|
|
|
seedId: number;
|
|
|
|
|
url: string;
|
|
|
|
|
depth: number;
|
|
|
|
|
extraHops: number;
|
|
|
|
|
noOOS: boolean;
|
|
|
|
|
},
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails = {},
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
2024-07-11 19:48:43 -07:00
|
|
|
return this.seeds[seedId].isIncluded(
|
|
|
|
|
url,
|
|
|
|
|
depth,
|
|
|
|
|
extraHops,
|
|
|
|
|
logDetails,
|
|
|
|
|
noOOS,
|
|
|
|
|
);
|
2024-06-18 16:11:48 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async isInScope(
|
|
|
|
|
{
|
|
|
|
|
seedId,
|
|
|
|
|
url,
|
|
|
|
|
depth,
|
|
|
|
|
extraHops,
|
|
|
|
|
}: { seedId: number; url: string; depth: number; extraHops: number },
|
|
|
|
|
logDetails = {},
|
|
|
|
|
): Promise<boolean> {
|
2024-06-13 17:18:06 -07:00
|
|
|
const seed = await this.crawlState.getSeedAt(
|
|
|
|
|
this.seeds,
|
|
|
|
|
this.numOriginalSeeds,
|
|
|
|
|
seedId,
|
|
|
|
|
);
|
2022-09-20 17:09:52 -07:00
|
|
|
|
2024-06-18 16:11:48 -07:00
|
|
|
return !!seed.isIncluded(url, depth, extraHops, logDetails);
|
2022-09-20 17:09:52 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async setupPage({
|
|
|
|
|
page,
|
|
|
|
|
cdp,
|
|
|
|
|
workerid,
|
|
|
|
|
callbacks,
|
2024-03-27 09:26:51 -07:00
|
|
|
frameIdToExecId,
|
|
|
|
|
}: WorkerOpts) {
|
2023-11-09 11:27:11 -08:00
|
|
|
await this.browser.setupPage({ page, cdp });
|
|
|
|
|
|
2024-03-27 09:26:51 -07:00
|
|
|
await this.setupExecContextEvents(cdp, frameIdToExecId);
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
if (
|
|
|
|
|
(this.adBlockRules && this.params.blockAds) ||
|
|
|
|
|
this.blockRules ||
|
|
|
|
|
this.originOverride
|
|
|
|
|
) {
|
2023-04-26 15:41:35 -07:00
|
|
|
await page.setRequestInterception(true);
|
|
|
|
|
|
|
|
|
|
if (this.adBlockRules && this.params.blockAds) {
|
|
|
|
|
await this.adBlockRules.initPage(this.browser, page);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.blockRules) {
|
|
|
|
|
await this.blockRules.initPage(this.browser, page);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.originOverride) {
|
|
|
|
|
await this.originOverride.initPage(this.browser, page);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (this.params.logging.includes("jserrors")) {
|
|
|
|
|
page.on("console", (msg) => {
|
|
|
|
|
if (msg.type() === "error") {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.warn(
|
|
|
|
|
msg.text(),
|
|
|
|
|
{ location: msg.location(), page: page.url(), workerid },
|
2023-11-09 19:11:11 -05:00
|
|
|
"jsError",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
page.on("pageerror", (e) => {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.warn(
|
|
|
|
|
"Page Error",
|
2023-11-14 21:54:40 -08:00
|
|
|
{ ...formatErr(e), page: page.url(), workerid },
|
2023-11-09 19:11:11 -05:00
|
|
|
"jsError",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.screencaster) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug("Start Screencast", { workerid }, "screencast");
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
await this.screencaster.screencastPage(page, cdp, workerid);
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
await page.exposeFunction(
|
|
|
|
|
ADD_LINK_FUNC,
|
2023-11-09 19:11:11 -05:00
|
|
|
(url: string) => callbacks.addLink && callbacks.addLink(url),
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-09-15 10:12:08 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (this.params.behaviorOpts) {
|
2023-11-09 11:27:11 -08:00
|
|
|
await page.exposeFunction(
|
|
|
|
|
BEHAVIOR_LOG_FUNC,
|
|
|
|
|
(logdata: { data: string; type: string }) =>
|
2023-11-09 19:11:11 -05:00
|
|
|
this._behaviorLog(logdata, page.url(), workerid),
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-07-06 16:09:48 -04:00
|
|
|
await this.browser.addInitScript(page, behaviors);
|
|
|
|
|
|
|
|
|
|
const initScript = `
|
|
|
|
|
self.__bx_behaviors.init(${this.params.behaviorOpts}, false);
|
|
|
|
|
${this.customBehaviors}
|
|
|
|
|
self.__bx_behaviors.selectMainBehavior();
|
|
|
|
|
`;
|
2023-12-13 12:14:53 -08:00
|
|
|
if (!this.behaviorsChecked && this.customBehaviors) {
|
|
|
|
|
await this.checkBehaviorScripts(cdp);
|
|
|
|
|
this.behaviorsChecked = true;
|
|
|
|
|
}
|
2023-07-06 16:09:48 -04:00
|
|
|
|
|
|
|
|
await this.browser.addInitScript(page, initScript);
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-27 09:26:51 -07:00
|
|
|
async setupExecContextEvents(
|
|
|
|
|
cdp: CDPSession,
|
|
|
|
|
frameIdToExecId: Map<string, number>,
|
|
|
|
|
) {
|
|
|
|
|
await cdp.send("Runtime.enable");
|
|
|
|
|
|
|
|
|
|
await cdp.on(
|
|
|
|
|
"Runtime.executionContextCreated",
|
|
|
|
|
(params: Protocol.Runtime.ExecutionContextCreatedEvent) => {
|
|
|
|
|
const { id, auxData } = params.context;
|
|
|
|
|
if (auxData && auxData.isDefault && auxData.frameId) {
|
|
|
|
|
frameIdToExecId.set(auxData.frameId, id);
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
await cdp.on(
|
|
|
|
|
"Runtime.executionContextDestroyed",
|
|
|
|
|
(params: Protocol.Runtime.ExecutionContextDestroyedEvent) => {
|
|
|
|
|
const { executionContextId } = params;
|
|
|
|
|
for (const [frameId, execId] of frameIdToExecId.entries()) {
|
|
|
|
|
if (execId === executionContextId) {
|
|
|
|
|
frameIdToExecId.delete(frameId);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
await cdp.on("Runtime.executionContextsCleared", () => {
|
|
|
|
|
frameIdToExecId.clear();
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
loadCustomBehaviors(filename: string) {
|
2023-07-06 16:09:48 -04:00
|
|
|
let str = "";
|
|
|
|
|
|
2023-12-13 12:14:53 -08:00
|
|
|
for (const { contents } of collectAllFileSources(filename, ".js")) {
|
|
|
|
|
str += `self.__bx_behaviors.load(${contents});\n`;
|
2023-07-06 16:09:48 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return str;
|
|
|
|
|
}
|
|
|
|
|
|
2023-12-13 12:14:53 -08:00
|
|
|
async checkBehaviorScripts(cdp: CDPSession) {
|
|
|
|
|
const filename = this.params.customBehaviors;
|
|
|
|
|
|
|
|
|
|
if (!filename) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (const { path, contents } of collectAllFileSources(filename, ".js")) {
|
|
|
|
|
await this.browser.checkScript(cdp, path, contents);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async getFavicon(page: Page, logDetails: LogDetails): Promise<string> {
|
2023-09-17 12:50:39 -07:00
|
|
|
try {
|
|
|
|
|
const resp = await fetch("http://127.0.0.1:9221/json");
|
|
|
|
|
if (resp.status === 200) {
|
|
|
|
|
const browserJson = await resp.json();
|
|
|
|
|
for (const jsons of browserJson) {
|
2023-11-09 11:27:11 -08:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
if (jsons.id === (page.target() as any)._targetId) {
|
2023-09-17 12:50:39 -07:00
|
|
|
return jsons.faviconUrl;
|
|
|
|
|
}
|
2023-09-10 11:29:35 -07:00
|
|
|
}
|
|
|
|
|
}
|
2023-09-17 12:50:39 -07:00
|
|
|
} catch (e) {
|
|
|
|
|
// ignore
|
2023-09-10 11:29:35 -07:00
|
|
|
}
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.warn(
|
|
|
|
|
"Failed to fetch favicon from browser /json endpoint",
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
|
|
|
|
return "";
|
2023-09-10 11:29:35 -07:00
|
|
|
}
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
async crawlPage(opts: WorkerState): Promise<void> {
|
2023-02-23 18:50:22 -08:00
|
|
|
await this.writeStats();
|
|
|
|
|
|
2024-05-24 14:51:51 -07:00
|
|
|
const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
|
2023-09-15 10:12:08 -07:00
|
|
|
data.callbacks = callbacks;
|
2023-02-23 18:50:22 -08:00
|
|
|
|
2024-06-20 16:35:30 -07:00
|
|
|
const { url, seedId } = data;
|
|
|
|
|
|
|
|
|
|
const auth = this.seeds[seedId].authHeader();
|
|
|
|
|
|
|
|
|
|
if (auth) {
|
|
|
|
|
logger.debug("Setting HTTP basic auth for seed", {
|
|
|
|
|
seedId,
|
|
|
|
|
seedUrl: this.seeds[seedId].url,
|
|
|
|
|
});
|
|
|
|
|
await page.setExtraHTTPHeaders({ Authorization: auth });
|
|
|
|
|
opts.isAuthSet = true;
|
|
|
|
|
} else if (opts.isAuthSet) {
|
|
|
|
|
await page.setExtraHTTPHeaders({});
|
|
|
|
|
}
|
2023-03-13 18:07:59 -04:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const logDetails = { page: url, workerid };
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
data.logDetails = logDetails;
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
data.workerid = workerid;
|
2022-09-20 17:09:52 -07:00
|
|
|
|
2024-05-24 14:51:51 -07:00
|
|
|
if (directFetchCapture) {
|
2023-11-07 21:38:50 -08:00
|
|
|
try {
|
2024-05-24 14:51:51 -07:00
|
|
|
const { fetched, mime, ts } = await timedRun(
|
|
|
|
|
directFetchCapture({ url, headers: this.headers, cdp }),
|
2023-11-07 21:38:50 -08:00
|
|
|
FETCH_TIMEOUT_SECS,
|
|
|
|
|
"Direct fetch capture attempt timed out",
|
|
|
|
|
logDetails,
|
|
|
|
|
"fetch",
|
2023-11-09 19:11:11 -05:00
|
|
|
true,
|
2023-11-07 21:38:50 -08:00
|
|
|
);
|
2024-05-24 14:51:51 -07:00
|
|
|
if (mime) {
|
|
|
|
|
data.mime = mime;
|
2024-06-26 09:16:24 -07:00
|
|
|
data.isHTMLPage = isHTMLMime(mime);
|
2024-05-24 14:51:51 -07:00
|
|
|
}
|
2023-11-07 21:38:50 -08:00
|
|
|
if (fetched) {
|
|
|
|
|
data.loadState = LoadState.FULL_PAGE_LOADED;
|
2024-02-28 22:56:12 -08:00
|
|
|
data.status = 200;
|
2024-05-24 14:51:51 -07:00
|
|
|
data.ts = ts || new Date();
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info(
|
|
|
|
|
"Direct fetch successful",
|
2024-05-24 14:51:51 -07:00
|
|
|
{ url, mime, ...logDetails },
|
2023-11-09 19:11:11 -05:00
|
|
|
"fetch",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2024-03-22 17:32:42 -07:00
|
|
|
return;
|
2023-11-07 21:38:50 -08:00
|
|
|
}
|
|
|
|
|
} catch (e) {
|
2023-11-13 09:16:57 -08:00
|
|
|
// filtered out direct fetch
|
|
|
|
|
logger.debug(
|
|
|
|
|
"Direct fetch response not accepted, continuing with browser fetch",
|
|
|
|
|
logDetails,
|
|
|
|
|
"fetch",
|
|
|
|
|
);
|
2023-11-07 21:38:50 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
// run custom driver here
|
2023-11-09 11:27:11 -08:00
|
|
|
await this.driver({ page, data, crawler: this });
|
2021-06-07 17:43:36 -07:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
data.title = await page.title();
|
2023-09-10 11:29:35 -07:00
|
|
|
data.favicon = await this.getFavicon(page, logDetails);
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
await this.doPostLoadActions(opts);
|
2024-06-26 09:16:24 -07:00
|
|
|
|
|
|
|
|
await this.awaitPageExtraDelay(opts);
|
2024-03-22 17:32:42 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async doPostLoadActions(opts: WorkerState, saveOutput = false) {
|
|
|
|
|
const { page, cdp, data, workerid } = opts;
|
|
|
|
|
const { url } = data;
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
if (!data.isHTMLPage) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
const logDetails = { page: url, workerid };
|
2023-10-31 23:05:30 -07:00
|
|
|
|
2024-03-26 14:54:27 -07:00
|
|
|
if (this.params.screenshot && this.screenshotWriter) {
|
2023-11-09 11:27:11 -08:00
|
|
|
const screenshots = new Screenshots({
|
|
|
|
|
browser: this.browser,
|
|
|
|
|
page,
|
|
|
|
|
url,
|
2024-03-26 14:54:27 -07:00
|
|
|
writer: this.screenshotWriter,
|
2023-11-09 11:27:11 -08:00
|
|
|
});
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (this.params.screenshot.includes("view")) {
|
2024-03-22 17:32:42 -07:00
|
|
|
await screenshots.take("view", saveOutput ? data : null);
|
2021-03-13 16:48:31 -08:00
|
|
|
}
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
if (this.params.screenshot.includes("fullPage")) {
|
|
|
|
|
await screenshots.takeFullPage();
|
|
|
|
|
}
|
|
|
|
|
if (this.params.screenshot.includes("thumbnail")) {
|
|
|
|
|
await screenshots.takeThumbnail();
|
|
|
|
|
}
|
|
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2023-10-31 23:05:30 -07:00
|
|
|
let textextract = null;
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
if (this.textWriter) {
|
2023-11-09 11:27:11 -08:00
|
|
|
textextract = new TextExtractViaSnapshot(cdp, {
|
2024-03-26 14:54:27 -07:00
|
|
|
writer: this.textWriter,
|
2023-11-09 11:27:11 -08:00
|
|
|
url,
|
2024-03-22 17:32:42 -07:00
|
|
|
skipDocs: this.skipTextDocs,
|
2023-11-09 11:27:11 -08:00
|
|
|
});
|
2024-03-22 17:32:42 -07:00
|
|
|
const { text } = await textextract.extractAndStoreText(
|
2023-11-09 11:27:11 -08:00
|
|
|
"text",
|
|
|
|
|
false,
|
2023-11-09 19:11:11 -05:00
|
|
|
this.params.text.includes("to-warc"),
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-10-31 23:05:30 -07:00
|
|
|
|
2024-08-09 16:20:56 -04:00
|
|
|
if (text !== null && (this.textInPages || saveOutput)) {
|
2023-10-31 23:05:30 -07:00
|
|
|
data.text = text;
|
|
|
|
|
}
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
2022-12-21 12:06:13 -05:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
data.loadState = LoadState.EXTRACTION_DONE;
|
2022-12-21 12:06:13 -05:00
|
|
|
|
2024-03-28 10:21:31 -07:00
|
|
|
if (this.params.behaviorOpts && data.status < 400) {
|
2024-06-26 09:16:24 -07:00
|
|
|
if (data.skipBehaviors) {
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
logger.info("Skipping behaviors for slow page", logDetails, "behavior");
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
} else {
|
|
|
|
|
const res = await timedRun(
|
2024-03-27 09:26:51 -07:00
|
|
|
this.runBehaviors(
|
|
|
|
|
page,
|
|
|
|
|
cdp,
|
|
|
|
|
data.filteredFrames,
|
|
|
|
|
opts.frameIdToExecId,
|
|
|
|
|
logDetails,
|
|
|
|
|
),
|
2023-03-22 14:50:18 -04:00
|
|
|
this.params.behaviorTimeout,
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
"Behaviors timed out",
|
|
|
|
|
logDetails,
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2024-06-13 15:42:27 -04:00
|
|
|
true,
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
);
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2023-09-14 19:48:41 -07:00
|
|
|
await this.netIdle(page, logDetails);
|
|
|
|
|
|
|
|
|
|
if (res) {
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
data.loadState = LoadState.BEHAVIORS_DONE;
|
2022-03-22 17:41:51 -07:00
|
|
|
}
|
2023-10-31 23:05:30 -07:00
|
|
|
|
|
|
|
|
if (textextract && this.params.text.includes("final-to-warc")) {
|
|
|
|
|
await textextract.extractAndStoreText("textFinal", true, true);
|
|
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
}
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
}
|
2024-06-26 09:16:24 -07:00
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
async awaitPageExtraDelay(opts: WorkerState) {
|
2023-03-22 14:50:18 -04:00
|
|
|
if (this.params.pageExtraDelay) {
|
2024-06-26 09:16:24 -07:00
|
|
|
const {
|
|
|
|
|
data: { url: page },
|
|
|
|
|
workerid,
|
|
|
|
|
} = opts;
|
|
|
|
|
|
|
|
|
|
const logDetails = { page, workerid };
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info(
|
|
|
|
|
`Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`,
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-03-22 14:50:18 -04:00
|
|
|
await sleep(this.params.pageExtraDelay);
|
|
|
|
|
}
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async pageFinished(data: PageState) {
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
await this.writePage(data);
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
// if page loaded, considered page finished successfully
|
|
|
|
|
// (even if behaviors timed out)
|
|
|
|
|
const { loadState, logDetails } = data;
|
|
|
|
|
|
|
|
|
|
if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info("Page Finished", { loadState, ...logDetails }, "pageStatus");
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
|
|
|
|
|
await this.crawlState.markFinished(data.url);
|
|
|
|
|
|
|
|
|
|
if (this.healthChecker) {
|
|
|
|
|
this.healthChecker.resetErrors();
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
await this.crawlState.markFailed(data.url);
|
|
|
|
|
|
|
|
|
|
if (this.healthChecker) {
|
|
|
|
|
this.healthChecker.incError();
|
|
|
|
|
}
|
|
|
|
|
}
|
2022-03-14 10:41:56 -07:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
await this.serializeConfig();
|
|
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
await this.checkLimits();
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-08 12:52:30 -08:00
|
|
|
async teardownPage({ workerid }: WorkerOpts) {
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
if (this.screencaster) {
|
|
|
|
|
await this.screencaster.stopById(workerid);
|
2021-02-08 22:21:34 -08:00
|
|
|
}
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
}
|
2023-03-13 14:48:04 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async workerIdle(workerid: WorkerId) {
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
if (this.screencaster) {
|
|
|
|
|
//logger.debug("End Screencast", {workerid}, "screencast");
|
|
|
|
|
await this.screencaster.stopById(workerid, true);
|
|
|
|
|
}
|
2021-02-08 22:21:34 -08:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async runBehaviors(
|
|
|
|
|
page: Page,
|
|
|
|
|
cdp: CDPSession,
|
|
|
|
|
frames: Frame[],
|
2024-03-27 09:26:51 -07:00
|
|
|
frameIdToExecId: Map<string, number>,
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails: LogDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
2023-03-08 21:31:19 -05:00
|
|
|
try {
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
frames = frames || page.frames();
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info(
|
|
|
|
|
"Running behaviors",
|
|
|
|
|
{
|
|
|
|
|
frames: frames.length,
|
|
|
|
|
frameUrls: frames.map((frame) => frame.url()),
|
|
|
|
|
...logDetails,
|
|
|
|
|
},
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-09-14 19:48:41 -07:00
|
|
|
const results = await Promise.allSettled(
|
2023-11-09 11:27:11 -08:00
|
|
|
frames.map((frame) =>
|
|
|
|
|
this.browser.evaluateWithCLI(
|
|
|
|
|
cdp,
|
2024-03-27 09:26:51 -07:00
|
|
|
frame,
|
|
|
|
|
frameIdToExecId,
|
2023-11-09 11:27:11 -08:00
|
|
|
`
|
2023-09-20 14:02:37 -05:00
|
|
|
if (!self.__bx_behaviors) {
|
|
|
|
|
console.error("__bx_behaviors missing, can't run behaviors");
|
|
|
|
|
} else {
|
|
|
|
|
self.__bx_behaviors.run();
|
2023-11-09 11:27:11 -08:00
|
|
|
}`,
|
|
|
|
|
logDetails,
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
|
|
|
|
),
|
|
|
|
|
),
|
2023-03-08 21:31:19 -05:00
|
|
|
);
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
for (const res of results) {
|
2024-03-27 09:26:51 -07:00
|
|
|
const { status, reason }: { status: string; reason?: unknown } = res;
|
2023-09-14 19:48:41 -07:00
|
|
|
if (status === "rejected") {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.warn(
|
|
|
|
|
"Behavior run partially failed",
|
2024-03-27 09:26:51 -07:00
|
|
|
{ reason: formatErr(reason), ...logDetails },
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-09-14 19:48:41 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info(
|
|
|
|
|
"Behaviors finished",
|
|
|
|
|
{ finished: results.length, ...logDetails },
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-09-14 19:48:41 -07:00
|
|
|
return true;
|
2023-03-08 21:31:19 -05:00
|
|
|
} catch (e) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.warn(
|
|
|
|
|
"Behavior run failed",
|
2023-11-14 21:54:40 -08:00
|
|
|
{ ...formatErr(e), ...logDetails },
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-09-14 19:48:41 -07:00
|
|
|
return false;
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async shouldIncludeFrame(frame: Frame, logDetails: LogDetails) {
|
2023-01-23 16:47:33 -08:00
|
|
|
if (!frame.parentFrame()) {
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
return frame;
|
2023-01-23 16:47:33 -08:00
|
|
|
}
|
|
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
const frameUrl = frame.url();
|
2023-01-23 16:47:33 -08:00
|
|
|
|
2024-05-09 11:05:33 +02:00
|
|
|
if (!frameUrl) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2023-09-14 19:48:41 -07:00
|
|
|
// this is all designed to detect and skip PDFs, and other frames that are actually EMBEDs
|
|
|
|
|
// if there's no tag or an iframe tag, then assume its a regular frame
|
2024-05-09 11:05:33 +02:00
|
|
|
let tagName = "";
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
tagName = await timedRun(
|
|
|
|
|
frame.evaluate(
|
|
|
|
|
"self && self.frameElement && self.frameElement.tagName",
|
|
|
|
|
),
|
|
|
|
|
PAGE_OP_TIMEOUT_SECS,
|
|
|
|
|
"Frame check timed out",
|
|
|
|
|
logDetails,
|
|
|
|
|
);
|
|
|
|
|
} catch (e) {
|
|
|
|
|
// ignore
|
|
|
|
|
}
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
|
2023-09-14 19:48:41 -07:00
|
|
|
if (tagName && tagName !== "IFRAME" && tagName !== "FRAME") {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
"Skipping processing non-frame object",
|
|
|
|
|
{ tagName, frameUrl, ...logDetails },
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
let res;
|
|
|
|
|
|
|
|
|
|
if (frameUrl === "about:blank") {
|
|
|
|
|
res = false;
|
|
|
|
|
} else {
|
2023-11-09 11:27:11 -08:00
|
|
|
res = this.adBlockRules && !this.adBlockRules.isAdUrl(frameUrl);
|
2023-02-23 18:50:22 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!res) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
"Skipping processing frame",
|
|
|
|
|
{ frameUrl, ...logDetails },
|
2023-11-09 19:11:11 -05:00
|
|
|
"behavior",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-01-23 16:47:33 -08:00
|
|
|
}
|
|
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
return res ? frame : null;
|
2023-01-23 16:47:33 -08:00
|
|
|
}
|
|
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
async checkLimits() {
|
|
|
|
|
let interrupt = false;
|
|
|
|
|
|
2024-06-07 10:34:19 -07:00
|
|
|
const size = this.params.dryRun ? 0 : await getDirSize(this.archivesDir);
|
2023-04-19 21:10:02 -04:00
|
|
|
|
|
|
|
|
await this.crawlState.setArchiveSize(size);
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
2023-03-31 12:35:18 -04:00
|
|
|
if (this.params.sizeLimit) {
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
if (size >= this.params.sizeLimit) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info(
|
2023-11-09 19:11:11 -05:00
|
|
|
`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
interrupt = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.params.timeLimit) {
|
2023-03-22 14:50:18 -04:00
|
|
|
const elapsed = secondsElapsed(this.startTime);
|
|
|
|
|
if (elapsed >= this.params.timeLimit) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info(
|
2023-11-09 19:11:11 -05:00
|
|
|
`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
interrupt = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-31 12:35:18 -04:00
|
|
|
if (this.params.diskUtilization) {
|
2023-07-06 00:58:28 -04:00
|
|
|
// Check that disk usage isn't already or soon to be above threshold
|
2024-06-07 19:13:15 +02:00
|
|
|
const diskUtil = await checkDiskUtilization(
|
|
|
|
|
this.collDir,
|
|
|
|
|
this.params,
|
|
|
|
|
size,
|
|
|
|
|
);
|
2023-07-06 00:58:28 -04:00
|
|
|
if (diskUtil.stop === true) {
|
2023-03-31 12:35:18 -04:00
|
|
|
interrupt = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-03 20:21:30 -07:00
|
|
|
if (this.params.failOnFailedLimit) {
|
2024-05-21 19:35:43 -04:00
|
|
|
const numFailed = await this.crawlState.numFailed();
|
|
|
|
|
const failedLimit = this.params.failOnFailedLimit;
|
|
|
|
|
if (numFailed >= failedLimit) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.fatal(
|
2024-05-21 19:35:43 -04:00
|
|
|
`Failed threshold reached ${numFailed} >= ${failedLimit}, failing crawl`,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-10-03 20:21:30 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
if (interrupt) {
|
2023-08-15 11:34:39 -07:00
|
|
|
this.uploadAndDeleteLocal = true;
|
2023-09-13 10:48:21 -07:00
|
|
|
this.gracefulFinishOnInterrupt();
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-09-13 10:48:21 -07:00
|
|
|
gracefulFinishOnInterrupt() {
|
2022-09-20 17:09:52 -07:00
|
|
|
this.interrupted = true;
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Crawler interrupted, gracefully finishing current pages");
|
2023-09-13 10:48:21 -07:00
|
|
|
if (!this.params.waitOnDone && !this.params.restartsOnError) {
|
2022-09-20 17:09:52 -07:00
|
|
|
this.finalExit = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-09 12:28:58 -07:00
|
|
|
async checkCanceled() {
|
2023-11-09 11:27:11 -08:00
|
|
|
if (this.crawlState && (await this.crawlState.isCrawlCanceled())) {
|
2023-10-09 12:28:58 -07:00
|
|
|
await this.setStatusAndExit(0, "canceled");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async setStatusAndExit(exitCode: number, status: string) {
|
2023-10-09 12:28:58 -07:00
|
|
|
logger.info(`Exiting, Crawl status: ${status}`);
|
|
|
|
|
|
2023-10-03 20:21:30 -07:00
|
|
|
await this.closeLog();
|
|
|
|
|
|
2023-10-09 12:28:58 -07:00
|
|
|
if (this.crawlState && status) {
|
|
|
|
|
await this.crawlState.setStatus(status);
|
2023-10-02 20:55:52 -04:00
|
|
|
}
|
|
|
|
|
process.exit(exitCode);
|
|
|
|
|
}
|
|
|
|
|
|
2022-09-20 17:09:52 -07:00
|
|
|
async serializeAndExit() {
|
|
|
|
|
await this.serializeConfig();
|
2023-10-02 20:55:52 -04:00
|
|
|
|
2023-10-03 20:21:30 -07:00
|
|
|
if (this.interrupted) {
|
2024-03-21 13:56:05 -07:00
|
|
|
await this.browser.close();
|
|
|
|
|
await closeWorkers(0);
|
2024-03-26 14:54:27 -07:00
|
|
|
await this.closeFiles();
|
2024-07-23 18:50:26 -07:00
|
|
|
if (!this.done) {
|
|
|
|
|
await this.setStatusAndExit(13, "interrupted");
|
|
|
|
|
return;
|
|
|
|
|
}
|
2023-10-03 20:21:30 -07:00
|
|
|
}
|
2024-07-23 18:50:26 -07:00
|
|
|
await this.setStatusAndExit(0, "done");
|
2022-09-20 17:09:52 -07:00
|
|
|
}
|
|
|
|
|
|
2023-08-22 09:16:00 -07:00
|
|
|
async isCrawlRunning() {
|
|
|
|
|
if (this.interrupted) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-09 12:28:58 -07:00
|
|
|
if (await this.crawlState.isCrawlCanceled()) {
|
|
|
|
|
await this.setStatusAndExit(0, "canceled");
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-22 09:16:00 -07:00
|
|
|
if (await this.crawlState.isCrawlStopped()) {
|
2023-09-13 10:48:21 -07:00
|
|
|
logger.info("Crawler is stopped");
|
2023-08-22 09:16:00 -07:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-01 19:22:53 -08:00
|
|
|
async crawl() {
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
if (this.params.healthCheckPort) {
|
2023-11-09 11:27:11 -08:00
|
|
|
this.healthChecker = new HealthChecker(
|
|
|
|
|
this.params.healthCheckPort,
|
2023-11-09 19:11:11 -05:00
|
|
|
this.params.workers,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
}
|
|
|
|
|
|
2020-10-31 13:16:37 -07:00
|
|
|
try {
|
2022-10-24 15:30:10 +02:00
|
|
|
const driverUrl = new URL(this.params.driver, import.meta.url);
|
2023-11-09 11:27:11 -08:00
|
|
|
this.driver = (await import(driverUrl.href)).default;
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn(`Error importing driver ${this.params.driver}`, e);
|
2020-11-01 19:22:53 -08:00
|
|
|
return;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
|
2022-06-30 19:24:26 -07:00
|
|
|
await this.initCrawlState();
|
|
|
|
|
|
|
|
|
|
let initState = await this.crawlState.getStatus();
|
|
|
|
|
|
|
|
|
|
while (initState === "debug") {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Paused for debugging, will continue after manual resume");
|
2022-06-30 19:24:26 -07:00
|
|
|
|
2023-03-13 14:48:04 -07:00
|
|
|
await sleep(60);
|
2022-06-30 19:24:26 -07:00
|
|
|
|
|
|
|
|
initState = await this.crawlState.getStatus();
|
|
|
|
|
}
|
|
|
|
|
|
2022-09-08 23:39:26 -07:00
|
|
|
// if already done, don't crawl anymore
|
|
|
|
|
if (initState === "done") {
|
|
|
|
|
this.done = true;
|
|
|
|
|
|
|
|
|
|
if (this.params.waitOnDone) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Already done, waiting for signal to exit...");
|
2022-09-08 23:39:26 -07:00
|
|
|
|
|
|
|
|
// wait forever until signal
|
|
|
|
|
await new Promise(() => {});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-05 14:27:17 -05:00
|
|
|
if (this.params.generateWACZ) {
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
this.storage = initStorage();
|
2021-11-23 12:53:30 -08:00
|
|
|
}
|
|
|
|
|
|
2023-05-03 16:25:59 -07:00
|
|
|
if (POST_CRAWL_STATES.includes(initState)) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info("crawl already finished, running post-crawl tasks", {
|
|
|
|
|
state: initState,
|
|
|
|
|
});
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.postCrawl();
|
|
|
|
|
return;
|
|
|
|
|
} else if (await this.crawlState.isCrawlStopped()) {
|
|
|
|
|
logger.info("crawl stopped, running post-crawl tasks");
|
2023-09-13 10:48:21 -07:00
|
|
|
this.finalExit = true;
|
2023-03-08 21:31:19 -05:00
|
|
|
await this.postCrawl();
|
|
|
|
|
return;
|
2023-10-09 12:28:58 -07:00
|
|
|
} else if (await this.crawlState.isCrawlCanceled()) {
|
|
|
|
|
logger.info("crawl canceled, will exit");
|
|
|
|
|
return;
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
2022-06-30 19:24:26 -07:00
|
|
|
|
|
|
|
|
await this.crawlState.setStatus("running");
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2024-04-11 13:55:52 -07:00
|
|
|
this.pagesFH = await this.initPages(this.seedPagesFile, "Seed Pages");
|
|
|
|
|
this.extraPagesFH = await this.initPages(
|
|
|
|
|
this.otherPagesFile,
|
|
|
|
|
"Non-Seed Pages",
|
|
|
|
|
);
|
2021-06-07 17:43:36 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
this.adBlockRules = new AdBlockRules(
|
|
|
|
|
this.captureBasePrefix,
|
2023-11-09 19:11:11 -05:00
|
|
|
this.params.adBlockMessage,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2022-10-25 10:53:32 -04:00
|
|
|
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
if (this.params.blockRules && this.params.blockRules.length) {
|
2023-11-09 11:27:11 -08:00
|
|
|
this.blockRules = new BlockRules(
|
|
|
|
|
this.params.blockRules,
|
|
|
|
|
this.captureBasePrefix,
|
2023-11-09 19:11:11 -05:00
|
|
|
this.params.blockMessage,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2021-07-19 15:49:43 -07:00
|
|
|
}
|
|
|
|
|
|
2022-02-23 12:09:48 -08:00
|
|
|
this.screencaster = this.initScreenCaster();
|
2021-06-07 17:43:36 -07:00
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
if (this.params.originOverride && this.params.originOverride.length) {
|
2023-04-19 19:17:15 -07:00
|
|
|
this.originOverride = new OriginOverride(this.params.originOverride);
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
await this._addInitialSeeds();
|
2020-11-14 21:55:02 +00:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
await this.browser.launch({
|
2023-04-24 10:26:56 -07:00
|
|
|
profileUrl: this.params.profile,
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
headless: this.params.headless,
|
|
|
|
|
emulateDevice: this.emulateDevice,
|
2024-03-22 13:37:14 -07:00
|
|
|
swOpt: this.params.serviceWorker,
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
chromeOptions: {
|
2024-06-10 13:11:00 -07:00
|
|
|
proxy: this.proxyServer,
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
userAgent: this.emulateDevice.userAgent,
|
2023-11-09 11:27:11 -08:00
|
|
|
extraArgs: this.extraChromeArgs(),
|
2023-09-18 15:24:33 -07:00
|
|
|
},
|
2023-11-09 11:27:11 -08:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
ondisconnect: (err: any) => {
|
2023-09-18 15:24:33 -07:00
|
|
|
this.interrupted = true;
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.error(
|
|
|
|
|
"Browser disconnected (crashed?), interrupting crawl",
|
|
|
|
|
err,
|
2023-11-09 19:11:11 -05:00
|
|
|
"browser",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
|
|
|
|
},
|
2024-03-22 17:32:42 -07:00
|
|
|
|
|
|
|
|
recording: this.recording,
|
2023-11-09 19:11:11 -05:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
2023-11-09 11:27:11 -08:00
|
|
|
} as any);
|
2023-11-07 21:38:50 -08:00
|
|
|
|
2023-03-22 14:50:18 -04:00
|
|
|
// --------------
|
|
|
|
|
// Run Crawl Here!
|
2024-03-22 17:32:42 -07:00
|
|
|
await runWorkers(this, this.params.workers, this.maxPageTime);
|
2023-03-22 14:50:18 -04:00
|
|
|
// --------------
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
await this.serializeConfig(true);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2024-04-11 13:55:52 -07:00
|
|
|
await this.closePages();
|
2021-04-29 14:34:56 -07:00
|
|
|
|
2024-03-26 14:54:27 -07:00
|
|
|
await this.closeFiles();
|
|
|
|
|
|
2023-09-15 00:16:19 +02:00
|
|
|
await this.writeStats();
|
2023-02-23 18:50:22 -08:00
|
|
|
|
2023-09-13 10:48:21 -07:00
|
|
|
// if crawl has been stopped, mark as final exit for post-crawl tasks
|
|
|
|
|
if (await this.crawlState.isCrawlStopped()) {
|
|
|
|
|
this.finalExit = true;
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
await this.postCrawl();
|
|
|
|
|
}
|
|
|
|
|
|
2024-04-11 13:55:52 -07:00
|
|
|
async closePages() {
|
|
|
|
|
if (this.pagesFH) {
|
|
|
|
|
try {
|
|
|
|
|
await new Promise<void>((resolve) =>
|
|
|
|
|
this.pagesFH!.close(() => resolve()),
|
|
|
|
|
);
|
|
|
|
|
} catch (e) {
|
|
|
|
|
// ignore
|
|
|
|
|
} finally {
|
|
|
|
|
this.pagesFH = null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.extraPagesFH) {
|
|
|
|
|
try {
|
|
|
|
|
await new Promise<void>((resolve) =>
|
|
|
|
|
this.extraPagesFH!.close(() => resolve()),
|
|
|
|
|
);
|
|
|
|
|
} catch (e) {
|
|
|
|
|
// ignore
|
|
|
|
|
} finally {
|
|
|
|
|
this.extraPagesFH = null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-26 14:54:27 -07:00
|
|
|
async closeFiles() {
|
|
|
|
|
if (this.textWriter) {
|
|
|
|
|
await this.textWriter.flush();
|
|
|
|
|
}
|
|
|
|
|
if (this.screenshotWriter) {
|
|
|
|
|
await this.screenshotWriter.flush();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
protected async _addInitialSeeds() {
|
2024-06-13 17:18:06 -07:00
|
|
|
for (let i = 0; i < this.seeds.length; i++) {
|
|
|
|
|
const seed = this.seeds[i];
|
2024-03-22 17:32:42 -07:00
|
|
|
if (!(await this.queueUrl(i, seed.url, 0, 0))) {
|
|
|
|
|
if (this.limitHit) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (seed.sitemap) {
|
|
|
|
|
await timedRun(
|
|
|
|
|
this.parseSitemap(seed, i),
|
|
|
|
|
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
|
|
|
|
|
"Sitemap initial fetch timed out",
|
|
|
|
|
{ sitemap: seed.sitemap, seed: seed.url },
|
|
|
|
|
"sitemap",
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-08 21:31:19 -05:00
|
|
|
async postCrawl() {
|
2024-06-07 10:34:19 -07:00
|
|
|
if (this.params.combineWARC && !this.params.dryRun) {
|
2021-03-31 13:41:27 -04:00
|
|
|
await this.combineWARC();
|
|
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
|
2024-06-07 10:34:19 -07:00
|
|
|
if (this.params.generateCDX && !this.params.dryRun) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Generating CDX");
|
2023-11-09 11:27:11 -08:00
|
|
|
await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.crawlState.setStatus("generate-cdx");
|
2023-11-07 21:38:50 -08:00
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
const warcList = await fsp.readdir(this.archivesDir);
|
2023-11-09 11:27:11 -08:00
|
|
|
const warcListFull = warcList.map((filename) =>
|
2024-03-22 17:32:42 -07:00
|
|
|
path.join(this.archivesDir, filename),
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-11-07 21:38:50 -08:00
|
|
|
|
|
|
|
|
//const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
|
|
|
|
|
const params = [
|
|
|
|
|
"-o",
|
|
|
|
|
path.join(this.collDir, "indexes", "index.cdxj"),
|
2023-11-09 11:27:11 -08:00
|
|
|
...warcListFull,
|
2023-11-07 21:38:50 -08:00
|
|
|
];
|
2023-11-09 11:27:11 -08:00
|
|
|
const indexResult = await this.awaitProcess(
|
2023-11-09 19:11:11 -05:00
|
|
|
child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd }),
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-05-07 16:00:56 -04:00
|
|
|
if (indexResult === 0) {
|
|
|
|
|
logger.debug("Indexing complete, CDX successfully created");
|
|
|
|
|
} else {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.error("Error indexing and generating CDX", {
|
|
|
|
|
"status code": indexResult,
|
|
|
|
|
});
|
2023-05-07 16:00:56 -04:00
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2023-09-13 10:04:09 -07:00
|
|
|
logger.info("Crawling done");
|
2023-02-24 18:31:08 -08:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
if (
|
|
|
|
|
this.params.generateWACZ &&
|
2024-06-07 10:34:19 -07:00
|
|
|
!this.params.dryRun &&
|
2023-11-09 11:27:11 -08:00
|
|
|
(!this.interrupted || this.finalExit || this.uploadAndDeleteLocal)
|
|
|
|
|
) {
|
2023-08-01 00:04:10 -07:00
|
|
|
const uploaded = await this.generateWACZ();
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
|
2023-08-01 00:04:10 -07:00
|
|
|
if (uploaded && this.uploadAndDeleteLocal) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.info(
|
2023-11-09 19:11:11 -05:00
|
|
|
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
try {
|
|
|
|
|
fs.rmSync(this.collDir, { recursive: true, force: true });
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn(`Unable to clear ${this.collDir} before exit`, e);
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-09-20 17:09:52 -07:00
|
|
|
if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) {
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
this.done = true;
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("All done, waiting for signal...");
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
await this.crawlState.setStatus("done");
|
|
|
|
|
|
|
|
|
|
// wait forever until signal
|
|
|
|
|
await new Promise(() => {});
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
|
|
|
|
}
|
2022-01-26 16:06:10 -08:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async closeLog(): Promise<void> {
|
2023-02-24 18:31:08 -08:00
|
|
|
// close file-based log
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.setExternalLogStream(null);
|
2023-09-13 10:04:09 -07:00
|
|
|
if (!this.logFH) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
2023-02-24 18:31:08 -08:00
|
|
|
try {
|
2023-11-09 11:27:11 -08:00
|
|
|
await new Promise<void>((resolve) => this.logFH.close(() => resolve()));
|
2023-02-24 18:31:08 -08:00
|
|
|
} catch (e) {
|
|
|
|
|
// ignore
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
async generateWACZ() {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Generating WACZ");
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.crawlState.setStatus("generate-wacz");
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
// Get a list of the warcs inside
|
2024-03-22 17:32:42 -07:00
|
|
|
const warcFileList = await fsp.readdir(this.archivesDir);
|
2022-02-08 15:31:55 -08:00
|
|
|
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
// is finished (>0 pages and all pages written)
|
|
|
|
|
const isFinished = await this.crawlState.isFinished();
|
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Num WARC Files: ${warcFileList.length}`);
|
2022-02-08 15:31:55 -08:00
|
|
|
if (!warcFileList.length) {
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
// if finished, just return
|
2023-11-09 11:27:11 -08:00
|
|
|
if (isFinished || (await this.crawlState.isCrawlCanceled())) {
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
return;
|
|
|
|
|
}
|
2023-09-13 10:48:21 -07:00
|
|
|
// if stopped, won't get anymore data
|
2023-05-19 07:38:16 -07:00
|
|
|
if (await this.crawlState.isCrawlStopped()) {
|
2023-09-13 10:48:21 -07:00
|
|
|
// possibly restarted after committing, so assume done here!
|
|
|
|
|
if ((await this.crawlState.numDone()) > 0) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
2023-05-19 07:38:16 -07:00
|
|
|
}
|
2023-10-03 20:21:30 -07:00
|
|
|
// fail crawl otherwise
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.fatal("No WARC Files, assuming crawl failed");
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
|
|
|
|
|
2023-09-13 10:04:09 -07:00
|
|
|
logger.debug("End of log file, storing logs in WACZ");
|
|
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
// Build the argument list to pass to the wacz create command
|
|
|
|
|
const waczFilename = this.params.collection.concat(".wacz");
|
|
|
|
|
const waczPath = path.join(this.collDir, waczFilename);
|
|
|
|
|
|
2023-02-24 18:31:08 -08:00
|
|
|
const createArgs = [
|
|
|
|
|
"create",
|
2023-11-09 11:27:11 -08:00
|
|
|
"-o",
|
|
|
|
|
waczPath,
|
|
|
|
|
"--pages",
|
2024-04-11 13:55:52 -07:00
|
|
|
this.seedPagesFile,
|
|
|
|
|
"--extra-pages",
|
|
|
|
|
this.otherPagesFile,
|
|
|
|
|
"--copy-pages",
|
2023-11-09 11:27:11 -08:00
|
|
|
"--log-directory",
|
|
|
|
|
this.logDir,
|
2023-02-24 18:31:08 -08:00
|
|
|
];
|
2022-02-08 15:31:55 -08:00
|
|
|
|
|
|
|
|
if (process.env.WACZ_SIGN_URL) {
|
|
|
|
|
createArgs.push("--signing-url");
|
|
|
|
|
createArgs.push(process.env.WACZ_SIGN_URL);
|
|
|
|
|
if (process.env.WACZ_SIGN_TOKEN) {
|
|
|
|
|
createArgs.push("--signing-token");
|
|
|
|
|
createArgs.push(process.env.WACZ_SIGN_TOKEN);
|
2021-11-23 12:53:30 -08:00
|
|
|
}
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
2022-02-08 15:31:55 -08:00
|
|
|
|
2023-04-04 10:46:03 -04:00
|
|
|
if (this.params.title) {
|
|
|
|
|
createArgs.push("--title");
|
|
|
|
|
createArgs.push(this.params.title);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.params.description) {
|
|
|
|
|
createArgs.push("--desc");
|
|
|
|
|
createArgs.push(this.params.description);
|
|
|
|
|
}
|
|
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
createArgs.push("-f");
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
warcFileList.forEach((val) =>
|
|
|
|
|
createArgs.push(path.join(this.archivesDir, val)),
|
|
|
|
|
);
|
2022-02-08 15:31:55 -08:00
|
|
|
|
|
|
|
|
// create WACZ
|
2023-11-09 11:27:11 -08:00
|
|
|
const waczResult = await this.awaitProcess(
|
2024-03-21 08:16:59 -07:00
|
|
|
child_process.spawn("wacz", createArgs, { detached: RUN_DETACHED }),
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2022-02-08 15:31:55 -08:00
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
if (waczResult !== 0) {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.error("Error creating WACZ", { "status code": waczResult });
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.fatal("Unable to write WACZ successfully");
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);
|
2022-02-08 15:31:55 -08:00
|
|
|
|
|
|
|
|
// Verify WACZ
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
/*
|
|
|
|
|
const validateArgs = ["validate"];
|
|
|
|
|
validateArgs.push("-f");
|
2022-02-08 15:31:55 -08:00
|
|
|
validateArgs.push(waczPath);
|
|
|
|
|
|
2022-12-15 12:38:41 -05:00
|
|
|
const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs));
|
2022-02-08 15:31:55 -08:00
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
if (waczVerifyResult !== 0) {
|
2022-02-08 15:31:55 -08:00
|
|
|
console.log("validate", waczVerifyResult);
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.fatal("Unable to verify WACZ created successfully");
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
*/
|
2022-02-08 15:31:55 -08:00
|
|
|
if (this.storage) {
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.crawlState.setStatus("uploading-wacz");
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
const filename = process.env.STORE_FILENAME || "@ts-@id.wacz";
|
|
|
|
|
const targetFilename = interpolateFilename(filename, this.crawlId);
|
|
|
|
|
|
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of >3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceed, but not do to other interruptions
- change sleep() to be in seconds
* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request
* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed
* bump to 0.6.0
2022-06-17 11:58:44 -07:00
|
|
|
await this.storage.uploadCollWACZ(waczPath, targetFilename, isFinished);
|
2023-08-01 00:04:10 -07:00
|
|
|
return true;
|
2022-02-08 15:31:55 -08:00
|
|
|
}
|
2023-08-01 00:04:10 -07:00
|
|
|
|
|
|
|
|
return false;
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
awaitProcess(proc: ChildProcess) {
|
|
|
|
|
const stdout: string[] = [];
|
|
|
|
|
const stderr: string[] = [];
|
2023-05-07 16:00:56 -04:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
proc.stdout!.on("data", (data) => {
|
2023-05-07 16:00:56 -04:00
|
|
|
stdout.push(data.toString());
|
2022-12-15 12:38:41 -05:00
|
|
|
});
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
proc.stderr!.on("data", (data) => {
|
2023-05-07 16:00:56 -04:00
|
|
|
stderr.push(data.toString());
|
2022-12-15 12:38:41 -05:00
|
|
|
});
|
2023-05-07 16:00:56 -04:00
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
return new Promise((resolve) => {
|
2023-05-07 16:00:56 -04:00
|
|
|
proc.on("close", (code) => {
|
|
|
|
|
if (stdout.length) {
|
|
|
|
|
logger.debug(stdout.join("\n"));
|
|
|
|
|
}
|
|
|
|
|
if (stderr.length && this.params.logging.includes("debug")) {
|
2023-09-13 10:05:05 -07:00
|
|
|
logger.debug(stderr.join("\n"));
|
2023-05-07 16:00:56 -04:00
|
|
|
}
|
|
|
|
|
resolve(code);
|
|
|
|
|
});
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
2023-04-26 15:41:35 -07:00
|
|
|
logMemory() {
|
|
|
|
|
const memUsage = process.memoryUsage();
|
|
|
|
|
const { heapUsed, heapTotal } = memUsage;
|
|
|
|
|
this.maxHeapUsed = Math.max(this.maxHeapUsed || 0, heapUsed);
|
|
|
|
|
this.maxHeapTotal = Math.max(this.maxHeapTotal || 0, heapTotal);
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.debug(
|
|
|
|
|
"Memory",
|
|
|
|
|
{
|
|
|
|
|
maxHeapUsed: this.maxHeapUsed,
|
|
|
|
|
maxHeapTotal: this.maxHeapTotal,
|
|
|
|
|
...memUsage,
|
|
|
|
|
},
|
2023-11-14 21:54:40 -08:00
|
|
|
"memoryStatus",
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-04-26 15:41:35 -07:00
|
|
|
}
|
|
|
|
|
|
2023-09-15 00:16:19 +02:00
|
|
|
async writeStats() {
|
2022-12-15 12:38:41 -05:00
|
|
|
if (!this.params.logging.includes("stats")) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
2020-12-02 16:26:20 +00:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
const realSize = await this.crawlState.queueSize();
|
2024-05-21 10:58:35 -07:00
|
|
|
const pendingPages = await this.crawlState.getPendingList();
|
2022-12-15 12:38:41 -05:00
|
|
|
const done = await this.crawlState.numDone();
|
2023-04-13 16:31:33 -04:00
|
|
|
const failed = await this.crawlState.numFailed();
|
2024-05-21 10:58:35 -07:00
|
|
|
const total = realSize + pendingPages.length + done;
|
2023-11-09 11:27:11 -08:00
|
|
|
const limit = { max: this.pageLimit || 0, hit: this.limitHit };
|
2022-12-15 12:38:41 -05:00
|
|
|
const stats = {
|
2023-11-09 11:27:11 -08:00
|
|
|
crawled: done,
|
|
|
|
|
total: total,
|
2024-05-21 10:58:35 -07:00
|
|
|
pending: pendingPages.length,
|
2023-11-09 11:27:11 -08:00
|
|
|
failed: failed,
|
|
|
|
|
limit: limit,
|
2024-05-21 10:58:35 -07:00
|
|
|
pendingPages,
|
2022-12-15 12:38:41 -05:00
|
|
|
};
|
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Crawl statistics", stats, "crawlStatus");
|
2023-04-26 15:41:35 -07:00
|
|
|
this.logMemory();
|
2022-12-15 12:38:41 -05:00
|
|
|
|
2023-09-15 00:16:19 +02:00
|
|
|
if (this.params.statsFilename) {
|
2020-12-02 16:26:20 +00:00
|
|
|
try {
|
2023-11-09 11:27:11 -08:00
|
|
|
await fsp.writeFile(
|
|
|
|
|
this.params.statsFilename,
|
2023-11-09 19:11:11 -05:00
|
|
|
JSON.stringify(stats, null, 2),
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (err) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.warn("Stats output failed", err);
|
2020-12-02 16:26:20 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async loadPage(
|
|
|
|
|
page: Page,
|
|
|
|
|
data: PageState,
|
2023-11-09 19:11:11 -05:00
|
|
|
selectorOptsList = DEFAULT_SELECTORS,
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
2024-02-28 11:31:59 -08:00
|
|
|
const { url, depth } = data;
|
2023-02-23 18:50:22 -08:00
|
|
|
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
const logDetails = data.logDetails;
|
2022-03-22 17:41:51 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const failCrawlOnError = depth === 0 && this.params.failOnFailedSeed;
|
2023-04-26 19:49:32 -04:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
// Attempt to load the page:
|
|
|
|
|
// - Already tried direct fetch w/o browser before getting here, and that resulted in an HTML page or non-200 response
|
|
|
|
|
// so now loading using the browser
|
|
|
|
|
// - If page.load() fails, but downloadResponse is set, then its a download, consider successful
|
|
|
|
|
// set page status to FULL_PAGE_LOADED (2)
|
|
|
|
|
// - If page.load() fails, but firstResponse is set to CONTENT_LOADED (1) state,
|
|
|
|
|
// consider a slow page, proceed to link extraction, but skip behaviors, issue warning
|
|
|
|
|
// - If page.load() fails otherwise and if failOnFailedSeed is set, fail crawl, otherwise fail page
|
|
|
|
|
// - If page.load() succeeds, check if page url is a chrome-error:// page, fail page (and or crawl if failOnFailedSeed and seed)
|
|
|
|
|
// - If at least one response, check if HTML, proceed with post-crawl actions only if HTML.
|
|
|
|
|
|
|
|
|
|
let downloadResponse: HTTPResponse | null = null;
|
|
|
|
|
let firstResponse: HTTPResponse | null = null;
|
|
|
|
|
let fullLoadedResponse: HTTPResponse | null = null;
|
|
|
|
|
|
|
|
|
|
// Detect if failure is actually caused by trying to load a non-page (eg. downloadable PDF),
|
|
|
|
|
// store the downloadResponse, if any
|
2023-11-09 11:27:11 -08:00
|
|
|
page.once("requestfailed", (req: HTTPRequest) => {
|
2024-06-26 09:16:24 -07:00
|
|
|
downloadResponse = getDownloadResponse(req);
|
2022-03-14 11:11:53 -07:00
|
|
|
});
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
// store the first successful non-redirect response, even if page doesn't load fully
|
|
|
|
|
const waitFirstResponse = (resp: HTTPResponse) => {
|
|
|
|
|
firstResponse = resp;
|
|
|
|
|
if (!isRedirectStatus(firstResponse.status())) {
|
|
|
|
|
// don't listen to any additional responses
|
|
|
|
|
page.off("response", waitFirstResponse);
|
|
|
|
|
}
|
|
|
|
|
};
|
2023-11-07 21:38:50 -08:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
page.on("response", waitFirstResponse);
|
|
|
|
|
|
|
|
|
|
// store that domcontentloaded was finished
|
|
|
|
|
page.once("domcontentloaded", () => {
|
|
|
|
|
data.loadState = LoadState.CONTENT_LOADED;
|
|
|
|
|
});
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
const gotoOpts = data.isHTMLPage
|
2023-11-09 11:27:11 -08:00
|
|
|
? this.gotoOpts
|
|
|
|
|
: { waitUntil: "domcontentloaded" };
|
2022-03-22 17:41:51 -07:00
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Awaiting page load", logDetails);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2021-05-21 15:37:02 -07:00
|
|
|
try {
|
2024-06-26 09:16:24 -07:00
|
|
|
// store the page load response when page fully loads
|
|
|
|
|
fullLoadedResponse = await page.goto(url, gotoOpts);
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
|
|
|
|
if (!(e instanceof Error)) {
|
|
|
|
|
throw e;
|
|
|
|
|
}
|
2023-09-18 15:24:33 -07:00
|
|
|
const msg = e.message || "";
|
2024-06-26 09:16:24 -07:00
|
|
|
|
|
|
|
|
// got firstResponse and content loaded, not a failure
|
|
|
|
|
if (firstResponse && data.loadState == LoadState.CONTENT_LOADED) {
|
2023-09-18 15:24:33 -07:00
|
|
|
// if timeout error, and at least got to content loaded, continue on
|
2024-06-26 09:16:24 -07:00
|
|
|
logger.warn(
|
|
|
|
|
"Page load timed out, loading but slowly, skipping behaviors",
|
|
|
|
|
{
|
2023-11-09 11:27:11 -08:00
|
|
|
msg,
|
|
|
|
|
...logDetails,
|
2024-06-26 09:16:24 -07:00
|
|
|
},
|
|
|
|
|
);
|
|
|
|
|
data.skipBehaviors = true;
|
|
|
|
|
} else if (!downloadResponse) {
|
|
|
|
|
if (failCrawlOnError) {
|
2023-09-18 15:24:33 -07:00
|
|
|
// if fail on error, immediately fail here
|
2024-05-15 14:02:33 -04:00
|
|
|
logger.fatal(
|
|
|
|
|
"Page Load Timeout, failing crawl",
|
|
|
|
|
{
|
|
|
|
|
msg,
|
|
|
|
|
...logDetails,
|
|
|
|
|
},
|
|
|
|
|
"general",
|
|
|
|
|
1,
|
|
|
|
|
);
|
2024-06-26 09:16:24 -07:00
|
|
|
// log if not already log and rethrow, consider page failed
|
|
|
|
|
} else if (msg !== "logged") {
|
|
|
|
|
logger.error("Page Load Failed, skipping page", {
|
|
|
|
|
msg,
|
|
|
|
|
loadState: data.loadState,
|
|
|
|
|
...logDetails,
|
|
|
|
|
});
|
|
|
|
|
e.message = "logged";
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
}
|
2024-06-26 09:16:24 -07:00
|
|
|
throw e;
|
2022-03-14 11:11:53 -07:00
|
|
|
}
|
2021-05-21 15:37:02 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
const resp = fullLoadedResponse || downloadResponse || firstResponse;
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
if (!resp) {
|
|
|
|
|
throw new Error("no response for page load, assuming failed");
|
|
|
|
|
}
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
const respUrl = resp.url();
|
|
|
|
|
const isChromeError = page.url().startsWith("chrome-error://");
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
if (depth === 0 && !isChromeError && respUrl !== url && !downloadResponse) {
|
|
|
|
|
data.seedId = await this.crawlState.addExtraSeed(
|
|
|
|
|
this.seeds,
|
|
|
|
|
this.numOriginalSeeds,
|
|
|
|
|
data.seedId,
|
|
|
|
|
respUrl,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2024-06-26 09:16:24 -07:00
|
|
|
logger.info("Seed page redirected, adding redirected seed", {
|
|
|
|
|
origUrl: url,
|
|
|
|
|
newUrl: respUrl,
|
|
|
|
|
seedId: data.seedId,
|
|
|
|
|
});
|
|
|
|
|
}
|
2023-11-09 11:27:11 -08:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
const status = resp.status();
|
|
|
|
|
data.status = status;
|
|
|
|
|
|
|
|
|
|
let failed = isChromeError;
|
|
|
|
|
|
|
|
|
|
if (this.params.failOnInvalidStatus && status >= 400) {
|
|
|
|
|
// Handle 4xx or 5xx response as a page load error
|
|
|
|
|
failed = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (failed) {
|
|
|
|
|
if (failCrawlOnError) {
|
|
|
|
|
logger.fatal(
|
|
|
|
|
"Seed Page Load Error, failing crawl",
|
|
|
|
|
{
|
|
|
|
|
status,
|
2023-11-09 11:27:11 -08:00
|
|
|
...logDetails,
|
2024-06-26 09:16:24 -07:00
|
|
|
},
|
|
|
|
|
"general",
|
|
|
|
|
1,
|
|
|
|
|
);
|
|
|
|
|
} else {
|
|
|
|
|
logger.error(
|
|
|
|
|
isChromeError ? "Page Crashed on Load" : "Page Invalid Status",
|
|
|
|
|
{
|
|
|
|
|
status,
|
|
|
|
|
...logDetails,
|
|
|
|
|
},
|
|
|
|
|
);
|
|
|
|
|
throw new Error("logged");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const contentType = resp.headers()["content-type"];
|
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking
- add PageState object, store loadstate (0 to 4) as well as other per-page-state properties on defined object.
- set loadState to 0 (failed) by default
- set loadState to 1 (content-loaded) on 'domcontentloaded' event
- if page.goto() finishes, set to loadState to 2 'full-page-load'.
- if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors
- page considered 'finished' if it got to at least loadState 2 'full-pageload', even if behaviors timed out
- pages: log 'loadState' as part of pages.jsonl
- improve frame detection: detect if frame actually not from a frame tag (eg. OBJECT) tag, and skip as well
- screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting
- deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements)
- workers ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages
- worker: only keeps track of crash state to recreate page, decouple crash and page failed/succeeded state
- screencaster: allow reusing caster slots with fixed ids
- interrupt timedCrawlPage() wait if 'crash' event happens
- crawler: pageFinished() callback when page finishes
- worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
2023-03-20 18:31:37 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
if (contentType) {
|
|
|
|
|
data.mime = contentType.split(";")[0];
|
|
|
|
|
data.isHTMLPage = isHTMLMime(data.mime);
|
2023-03-08 21:31:19 -05:00
|
|
|
} else {
|
2024-06-26 09:16:24 -07:00
|
|
|
// guess that its html if it fully loaded as a page
|
|
|
|
|
data.isHTMLPage = !!fullLoadedResponse;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Full Page Loaded if:
|
|
|
|
|
// - it was a download response
|
|
|
|
|
// - page.load() succeeded
|
|
|
|
|
// but not:
|
|
|
|
|
// - if first response was received, but not fully loaded
|
|
|
|
|
if (fullLoadedResponse || downloadResponse) {
|
|
|
|
|
data.loadState = LoadState.FULL_PAGE_LOADED;
|
2023-03-08 21:31:19 -05:00
|
|
|
}
|
2022-03-22 17:41:51 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
if (!data.isHTMLPage) {
|
|
|
|
|
data.filteredFrames = [];
|
|
|
|
|
|
|
|
|
|
logger.info(
|
|
|
|
|
"Non-HTML Page URL, skipping all post-crawl actions",
|
|
|
|
|
{ isDownload: !!downloadResponse, mime: data.mime, ...logDetails },
|
|
|
|
|
"pageStatus",
|
|
|
|
|
);
|
2022-03-22 17:41:51 -07:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
// HTML Pages Only here
|
|
|
|
|
const frames = await page.frames();
|
|
|
|
|
|
|
|
|
|
const filteredFrames = await Promise.allSettled(
|
|
|
|
|
frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)),
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
data.filteredFrames = filteredFrames
|
|
|
|
|
.filter((x: PromiseSettledResult<Frame | null>) => {
|
|
|
|
|
if (x.status === "fulfilled") {
|
|
|
|
|
return !!x.value;
|
|
|
|
|
}
|
|
|
|
|
logger.warn("Error in iframe check", {
|
|
|
|
|
reason: x.reason,
|
|
|
|
|
...logDetails,
|
|
|
|
|
});
|
|
|
|
|
return false;
|
|
|
|
|
})
|
|
|
|
|
.map((x) => (x as PromiseFulfilledResult<Frame>).value);
|
|
|
|
|
|
|
|
|
|
//data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails));
|
|
|
|
|
|
2024-02-28 11:31:59 -08:00
|
|
|
const { seedId } = data;
|
|
|
|
|
|
2024-06-13 17:18:06 -07:00
|
|
|
const seed = await this.crawlState.getSeedAt(
|
|
|
|
|
this.seeds,
|
|
|
|
|
this.numOriginalSeeds,
|
|
|
|
|
seedId,
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
if (!seed) {
|
|
|
|
|
logger.error(
|
|
|
|
|
"Seed not found, likely invalid crawl state - skipping link extraction and behaviors",
|
|
|
|
|
{ seedId, ...logDetails },
|
|
|
|
|
);
|
|
|
|
|
return;
|
|
|
|
|
}
|
2021-07-20 15:45:51 -07:00
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
await this.checkCF(page, logDetails);
|
2022-03-18 10:32:59 -07:00
|
|
|
|
2023-02-23 18:50:22 -08:00
|
|
|
await this.netIdle(page, logDetails);
|
2022-09-20 17:09:52 -07:00
|
|
|
|
2024-04-18 17:16:57 -07:00
|
|
|
await this.awaitPageLoad(page.mainFrame(), logDetails);
|
2024-03-28 17:17:29 -07:00
|
|
|
|
2021-07-20 15:45:51 -07:00
|
|
|
// skip extraction if at max depth
|
2021-07-23 18:31:43 -07:00
|
|
|
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
2023-08-31 13:42:14 -07:00
|
|
|
logger.debug("Skipping Link Extraction, At Max Depth");
|
2021-07-20 15:45:51 -07:00
|
|
|
return;
|
|
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2023-09-15 10:12:08 -07:00
|
|
|
logger.debug("Extracting links", logDetails);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-09-15 10:12:08 -07:00
|
|
|
await this.extractLinks(page, data, selectorOptsList, logDetails);
|
2021-07-23 18:31:43 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async netIdle(page: Page, details: LogDetails) {
|
2022-07-08 17:17:46 -07:00
|
|
|
if (!this.params.netIdleWait) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
// in case page starts loading via fetch/xhr immediately after page load,
|
|
|
|
|
// we want to ensure we don't exit too early
|
2023-03-13 14:48:04 -07:00
|
|
|
await sleep(0.5);
|
2022-07-08 17:17:46 -07:00
|
|
|
|
|
|
|
|
try {
|
2023-11-09 11:27:11 -08:00
|
|
|
await this.browser.waitForNetworkIdle(page, {
|
|
|
|
|
timeout: this.params.netIdleWait * 1000,
|
|
|
|
|
});
|
2022-07-08 17:17:46 -07:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("waitForNetworkIdle timed out, ignoring", details);
|
2022-07-08 17:17:46 -07:00
|
|
|
// ignore, continue
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-04-18 17:16:57 -07:00
|
|
|
async awaitPageLoad(frame: Frame, logDetails: LogDetails) {
|
|
|
|
|
logger.debug(
|
|
|
|
|
"Waiting for custom page load via behavior",
|
|
|
|
|
logDetails,
|
|
|
|
|
"behavior",
|
|
|
|
|
);
|
|
|
|
|
try {
|
2024-05-24 14:51:51 -07:00
|
|
|
await frame.evaluate(
|
|
|
|
|
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
|
|
|
|
|
);
|
2024-04-18 17:16:57 -07:00
|
|
|
} catch (e) {
|
|
|
|
|
logger.warn("Waiting for custom page load failed", e, "behavior");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.params.postLoadDelay) {
|
|
|
|
|
logger.info("Awaiting post load delay", {
|
|
|
|
|
seconds: this.params.postLoadDelay,
|
|
|
|
|
});
|
|
|
|
|
await sleep(this.params.postLoadDelay);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async extractLinks(
|
|
|
|
|
page: Page,
|
|
|
|
|
data: PageState,
|
|
|
|
|
selectors = DEFAULT_SELECTORS,
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails: LogDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
|
|
|
|
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
|
2021-07-23 18:31:43 -07:00
|
|
|
|
2024-02-28 22:56:32 -08:00
|
|
|
callbacks.addLink = async (url: string) => {
|
2024-07-11 19:48:43 -07:00
|
|
|
await this.queueInScopeUrls(
|
|
|
|
|
seedId,
|
|
|
|
|
[url],
|
|
|
|
|
depth,
|
|
|
|
|
extraHops,
|
|
|
|
|
false,
|
|
|
|
|
logDetails,
|
|
|
|
|
);
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
};
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const loadLinks = (options: {
|
|
|
|
|
selector: string;
|
|
|
|
|
extract: string;
|
|
|
|
|
isAttribute: boolean;
|
|
|
|
|
addLinkFunc: string;
|
|
|
|
|
}) => {
|
2023-09-15 10:12:08 -07:00
|
|
|
const { selector, extract, isAttribute, addLinkFunc } = options;
|
2023-11-09 11:27:11 -08:00
|
|
|
const urls = new Set<string>();
|
2023-09-15 10:12:08 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
const getAttr = (elem: any) => urls.add(elem.getAttribute(extract));
|
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
const getProp = (elem: any) => urls.add(elem[extract]);
|
2023-09-15 10:12:08 -07:00
|
|
|
|
|
|
|
|
const getter = isAttribute ? getAttr : getProp;
|
|
|
|
|
|
|
|
|
|
document.querySelectorAll(selector).forEach(getter);
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
2023-11-09 19:11:11 -05:00
|
|
|
const func = (window as any)[addLinkFunc] as (
|
|
|
|
|
url: string,
|
|
|
|
|
) => NonNullable<unknown>;
|
2023-11-09 11:27:11 -08:00
|
|
|
urls.forEach((url) => func.call(this, url));
|
2023-09-15 10:12:08 -07:00
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
const frames = filteredFrames || page.frames();
|
2023-03-08 21:31:19 -05:00
|
|
|
|
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
|
|
|
try {
|
2023-11-09 11:27:11 -08:00
|
|
|
for (const {
|
|
|
|
|
selector = "a[href]",
|
|
|
|
|
extract = "href",
|
|
|
|
|
isAttribute = false,
|
|
|
|
|
} of selectors) {
|
2023-09-15 10:12:08 -07:00
|
|
|
const promiseResults = await Promise.allSettled(
|
2023-11-09 11:27:11 -08:00
|
|
|
frames.map((frame) =>
|
|
|
|
|
timedRun(
|
|
|
|
|
frame.evaluate(loadLinks, {
|
|
|
|
|
selector,
|
|
|
|
|
extract,
|
|
|
|
|
isAttribute,
|
|
|
|
|
addLinkFunc: ADD_LINK_FUNC,
|
|
|
|
|
}),
|
|
|
|
|
PAGE_OP_TIMEOUT_SECS,
|
|
|
|
|
"Link extraction timed out",
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails,
|
|
|
|
|
),
|
|
|
|
|
),
|
2023-09-15 10:12:08 -07:00
|
|
|
);
|
2021-07-20 15:45:51 -07:00
|
|
|
|
2023-09-15 10:12:08 -07:00
|
|
|
for (let i = 0; i < promiseResults.length; i++) {
|
2023-11-09 11:27:11 -08:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
|
const { status, reason } = promiseResults[i] as any;
|
2023-09-15 10:12:08 -07:00
|
|
|
if (status === "rejected") {
|
2023-11-09 11:27:11 -08:00
|
|
|
logger.warn("Link Extraction failed in frame", {
|
|
|
|
|
reason,
|
|
|
|
|
frameUrl: frames[i].url,
|
|
|
|
|
...logDetails,
|
|
|
|
|
});
|
2021-07-20 15:45:51 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
|
|
|
|
logger.warn("Link Extraction failed", e, "links");
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
2020-11-14 21:55:02 +00:00
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async queueInScopeUrls(
|
|
|
|
|
seedId: number,
|
|
|
|
|
urls: string[],
|
|
|
|
|
depth: number,
|
|
|
|
|
extraHops = 0,
|
2024-07-11 19:48:43 -07:00
|
|
|
noOOS = false,
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails: LogDetails = {},
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
2020-10-31 13:16:37 -07:00
|
|
|
try {
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
depth += 1;
|
|
|
|
|
|
2022-01-15 09:03:09 -08:00
|
|
|
// new number of extra hops, set if this hop is out-of-scope (oos)
|
|
|
|
|
const newExtraHops = extraHops + 1;
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
|
2022-01-15 09:03:09 -08:00
|
|
|
for (const possibleUrl of urls) {
|
2024-06-18 16:11:48 -07:00
|
|
|
const res = this.getScope(
|
2024-07-11 19:48:43 -07:00
|
|
|
{ url: possibleUrl, extraHops: newExtraHops, depth, seedId, noOOS },
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2022-01-15 09:03:09 -08:00
|
|
|
|
|
|
|
|
if (!res) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const { url, isOOS } = res;
|
2022-01-15 09:03:09 -08:00
|
|
|
|
|
|
|
|
if (url) {
|
2023-11-09 11:27:11 -08:00
|
|
|
await this.queueUrl(
|
|
|
|
|
seedId,
|
|
|
|
|
url,
|
|
|
|
|
depth,
|
|
|
|
|
isOOS ? newExtraHops : extraHops,
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
}
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
|
|
|
|
logger.error("Queuing Error", e, "links");
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async checkCF(page: Page, logDetails: LogDetails) {
|
2022-03-18 10:32:59 -07:00
|
|
|
try {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug("Check CF Blocking", logDetails);
|
2023-03-08 21:31:19 -05:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
while (
|
|
|
|
|
await timedRun(
|
|
|
|
|
page.$("div.cf-browser-verification.cf-im-under-attack"),
|
|
|
|
|
PAGE_OP_TIMEOUT_SECS,
|
|
|
|
|
"Cloudflare check timed out",
|
|
|
|
|
logDetails,
|
|
|
|
|
"general",
|
2023-11-09 19:11:11 -05:00
|
|
|
true,
|
2023-11-09 11:27:11 -08:00
|
|
|
)
|
|
|
|
|
) {
|
|
|
|
|
logger.debug(
|
|
|
|
|
"Cloudflare Check Detected, waiting for reload...",
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-03-13 14:48:04 -07:00
|
|
|
await sleep(5.5);
|
2022-03-18 10:32:59 -07:00
|
|
|
}
|
|
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
//logger.warn("Check CF failed, ignoring");
|
2022-03-18 10:32:59 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async queueUrl(
|
|
|
|
|
seedId: number,
|
|
|
|
|
url: string,
|
|
|
|
|
depth: number,
|
|
|
|
|
extraHops: number,
|
2023-11-09 19:11:11 -05:00
|
|
|
logDetails: LogDetails = {},
|
2024-03-22 17:32:42 -07:00
|
|
|
ts = 0,
|
|
|
|
|
pageid?: string,
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (this.limitHit) {
|
Per-Seed Scoping Rules + Crawl Depth (#63)
* scoped seeds:
- support per-seed scoping (include + exclude), allowHash, depth, and sitemap options
- support maxDepth per seed #16
- combine --url, --seed and --urlFile/--seedFile urls into a unified seed list
arg parsing:
- simplify seed file options into --seedFile/--urlFile, move option in help display
- rename --maxDepth -> --depth, supported globally and per seed
- ensure custom parsed params from argParser passed back correctly (behaviors, logging, device emulation)
- update to latest js-yaml
- rename --yamlConfig -> --config
- config: support reading config from stdin if --config set to 'stdin'
* scope: fix typo in 'prefix' scope
* update browsertrix-behaviors to 0.2.2
* tests: add test for passing config via stdin, also adding --excludes via cmdline
* update README:
- latest cli, add docs on config via stdin
- rename --yamlConfig -> --config, consolidate --seedFile/--urlFile, move arg position
- info on scoped seeds
- list current scope types
2021-06-26 13:11:29 -07:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const result = await this.crawlState.addToQueue(
|
2024-03-22 17:32:42 -07:00
|
|
|
{ url, seedId, depth, extraHops, ts, pageid },
|
2023-11-09 19:11:11 -05:00
|
|
|
this.pageLimit,
|
2023-11-09 11:27:11 -08:00
|
|
|
);
|
2023-09-15 10:12:08 -07:00
|
|
|
|
|
|
|
|
switch (result) {
|
2023-11-09 19:11:11 -05:00
|
|
|
case QueueState.ADDED:
|
|
|
|
|
logger.debug("Queued new page url", { url, ...logDetails }, "links");
|
|
|
|
|
return true;
|
2023-09-15 10:12:08 -07:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
case QueueState.LIMIT_HIT:
|
|
|
|
|
logger.debug(
|
|
|
|
|
"Not queued page url, at page limit",
|
|
|
|
|
{ url, ...logDetails },
|
|
|
|
|
"links",
|
|
|
|
|
);
|
|
|
|
|
this.limitHit = true;
|
|
|
|
|
return false;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
case QueueState.DUPE_URL:
|
|
|
|
|
logger.debug(
|
|
|
|
|
"Not queued page url, already seen",
|
|
|
|
|
{ url, ...logDetails },
|
|
|
|
|
"links",
|
|
|
|
|
);
|
|
|
|
|
return false;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
}
|
|
|
|
|
|
2023-09-15 10:12:08 -07:00
|
|
|
return false;
|
2020-10-31 13:16:37 -07:00
|
|
|
}
|
|
|
|
|
|
2024-04-11 13:55:52 -07:00
|
|
|
async initPages(filename: string, title: string) {
|
|
|
|
|
let fh = null;
|
|
|
|
|
|
2021-02-04 00:28:32 -05:00
|
|
|
try {
|
2024-04-11 13:55:52 -07:00
|
|
|
await fsp.mkdir(this.pagesDir, { recursive: true });
|
2021-04-29 14:34:56 -07:00
|
|
|
|
2024-04-11 13:55:52 -07:00
|
|
|
const createNew = !fs.existsSync(filename);
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2024-04-11 13:55:52 -07:00
|
|
|
fh = fs.createWriteStream(filename, { flags: "a" });
|
2021-04-29 14:34:56 -07:00
|
|
|
|
|
|
|
|
if (createNew) {
|
2023-11-09 11:27:11 -08:00
|
|
|
const header: Record<string, string> = {
|
|
|
|
|
format: "json-pages-1.0",
|
|
|
|
|
id: "pages",
|
2024-04-11 13:55:52 -07:00
|
|
|
title,
|
2023-11-09 11:27:11 -08:00
|
|
|
};
|
2024-04-11 13:55:52 -07:00
|
|
|
header.hasText = this.params.text.includes("to-pages");
|
2023-10-31 23:05:30 -07:00
|
|
|
if (this.params.text.length) {
|
|
|
|
|
logger.debug("Text Extraction: " + this.params.text.join(","));
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
} else {
|
2023-10-31 23:05:30 -07:00
|
|
|
logger.debug("Text Extraction: None");
|
2021-02-23 16:52:54 -05:00
|
|
|
}
|
2024-04-11 13:55:52 -07:00
|
|
|
await fh.write(JSON.stringify(header) + "\n");
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (err) {
|
2024-04-11 13:55:52 -07:00
|
|
|
logger.error(`"${filename}" creation failed`, err);
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
2024-04-11 13:55:52 -07:00
|
|
|
return fh;
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
protected pageEntryForRedis(
|
|
|
|
|
entry: Record<string, string | number | boolean | object>,
|
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
|
|
|
state: PageState,
|
|
|
|
|
) {
|
|
|
|
|
return entry;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async writePage(state: PageState) {
|
|
|
|
|
const {
|
|
|
|
|
pageid,
|
|
|
|
|
url,
|
|
|
|
|
depth,
|
|
|
|
|
title,
|
|
|
|
|
text,
|
|
|
|
|
loadState,
|
|
|
|
|
mime,
|
|
|
|
|
favicon,
|
|
|
|
|
status,
|
|
|
|
|
} = state;
|
|
|
|
|
|
|
|
|
|
const row: PageEntry = { id: pageid, url, title, loadState };
|
2023-11-07 21:38:50 -08:00
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
let { ts } = state;
|
2024-03-22 13:37:14 -07:00
|
|
|
if (!ts) {
|
|
|
|
|
ts = new Date();
|
2024-06-07 10:34:19 -07:00
|
|
|
if (!this.params.dryRun) {
|
|
|
|
|
logger.warn(
|
|
|
|
|
"Page date missing, setting to now",
|
|
|
|
|
{ url, ts },
|
|
|
|
|
"pageStatus",
|
|
|
|
|
);
|
|
|
|
|
}
|
2024-02-09 19:44:17 -05:00
|
|
|
}
|
|
|
|
|
|
2024-03-22 13:37:14 -07:00
|
|
|
row.ts = ts.toISOString();
|
|
|
|
|
|
2023-11-07 21:38:50 -08:00
|
|
|
if (mime) {
|
|
|
|
|
row.mime = mime;
|
|
|
|
|
}
|
2021-02-23 16:52:54 -05:00
|
|
|
|
2024-02-28 22:56:12 -08:00
|
|
|
if (status) {
|
|
|
|
|
row.status = status;
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-09 19:44:17 -05:00
|
|
|
if (this.params.writePagesToRedis) {
|
2024-03-22 17:32:42 -07:00
|
|
|
await this.crawlState.writeToPagesQueue(
|
|
|
|
|
JSON.stringify(this.pageEntryForRedis(row, state)),
|
|
|
|
|
);
|
2024-02-09 19:44:17 -05:00
|
|
|
}
|
|
|
|
|
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
if (depth === 0) {
|
|
|
|
|
row.seed = true;
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
if (text && this.textInPages) {
|
2022-03-14 10:41:56 -07:00
|
|
|
row.text = text;
|
2021-02-23 16:52:54 -05:00
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
2024-03-22 13:37:14 -07:00
|
|
|
if (favicon) {
|
2023-09-10 11:29:35 -07:00
|
|
|
row.favIconUrl = favicon;
|
|
|
|
|
}
|
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
const processedRow = JSON.stringify(row) + "\n";
|
2024-04-11 13:55:52 -07:00
|
|
|
|
|
|
|
|
const pagesFH = depth > 0 ? this.extraPagesFH : this.pagesFH;
|
|
|
|
|
|
|
|
|
|
if (!pagesFH) {
|
|
|
|
|
logger.error("Can't write pages, missing stream", {}, "pageStatus");
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2021-02-04 00:28:32 -05:00
|
|
|
try {
|
2024-04-11 13:55:52 -07:00
|
|
|
await pagesFH.write(processedRow);
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (err) {
|
2024-04-11 13:55:52 -07:00
|
|
|
logger.warn(
|
|
|
|
|
"Page append failed",
|
|
|
|
|
{ pagesFile: depth > 0 ? this.otherPagesFile : this.seedPagesFile },
|
|
|
|
|
"pageStatus",
|
|
|
|
|
);
|
2021-02-04 00:28:32 -05:00
|
|
|
}
|
|
|
|
|
}
|
2021-06-23 19:36:32 -07:00
|
|
|
|
SAX-based sitemap parser (#497)
Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser
Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a
time (currently 5)
- `fromDate` and `toDate` filter dates, to only include URLs between the given
dates, filtering nested sitemap lists included
- async parsing, continue parsing in the background after 100 URLs
- timeout for initial fetch / first 100 URLs set to 30 seconds to avoid
slowing down the crawl
- save/load state integration: mark if sitemaps have already been parsed
in redis, serialize to save state, to avoid reparsing again. (Will
reparse if parsing did not fully finish)
- Aware of `pageLimit`, don't add URLs pass the page limit, interrupt
further parsing when at limit.
- robots.txt `sitemap:` parsing, check URL extension and mime type
- automatic detection of sitemaps for a seed URL if no sitemap url provided - first check robots.txt,
then /sitemap.xml
- tests: test for full sitemap autodetect, sitemap with limit, and sitemap from specific URL.
Fixes #496
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-03-18 19:14:07 -07:00
|
|
|
async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
|
|
|
|
|
if (!sitemap) {
|
|
|
|
|
return;
|
2023-09-13 13:20:41 -04:00
|
|
|
}
|
|
|
|
|
|
SAX-based sitemap parser (#497)
Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser
Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a
time (currently 5)
- `fromDate` and `toDate` filter dates, to only include URLs between the given
dates, filtering nested sitemap lists included
- async parsing, continue parsing in the background after 100 URLs
- timeout for initial fetch / first 100 URLs set to 30 seconds to avoid
slowing down the crawl
- save/load state integration: mark if sitemaps have already been parsed
in redis, serialize to save state, to avoid reparsing again. (Will
reparse if parsing did not fully finish)
- Aware of `pageLimit`, don't add URLs pass the page limit, interrupt
further parsing when at limit.
- robots.txt `sitemap:` parsing, check URL extension and mime type
- automatic detection of sitemaps for a seed URL if no sitemap url provided - first check robots.txt,
then /sitemap.xml
- tests: test for full sitemap autodetect, sitemap with limit, and sitemap from specific URL.
Fixes #496
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-03-18 19:14:07 -07:00
|
|
|
if (await this.crawlState.isSitemapDone()) {
|
|
|
|
|
logger.info("Sitemap already processed, skipping", "sitemap");
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const fromDate = this.params.sitemapFromDate;
|
|
|
|
|
const toDate = this.params.sitemapToDate;
|
|
|
|
|
const headers = this.headers;
|
|
|
|
|
|
|
|
|
|
logger.info(
|
|
|
|
|
"Fetching sitemap",
|
|
|
|
|
{ from: fromDate || "<any date>", to: fromDate || "<any date>" },
|
|
|
|
|
"sitemap",
|
|
|
|
|
);
|
|
|
|
|
const sitemapper = new SitemapReader({
|
|
|
|
|
headers,
|
|
|
|
|
fromDate,
|
|
|
|
|
toDate,
|
|
|
|
|
limit: this.pageLimit,
|
2020-11-14 21:55:02 +00:00
|
|
|
});
|
|
|
|
|
|
|
|
|
|
try {
|
SAX-based sitemap parser (#497)
Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser
Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a
time (currently 5)
- `fromDate` and `toDate` filter dates, to only include URLs between the given
dates, filtering nested sitemap lists included
- async parsing, continue parsing in the background after 100 URLs
- timeout for initial fetch / first 100 URLs set to 30 seconds to avoid
slowing down the crawl
- save/load state integration: mark if sitemaps have already been parsed
in redis, serialize to save state, to avoid reparsing again. (Will
reparse if parsing did not fully finish)
- Aware of `pageLimit`, don't add URLs pass the page limit, interrupt
further parsing when at limit.
- robots.txt `sitemap:` parsing, check URL extension and mime type
- automatic detection of sitemaps for a seed URL if no sitemap url provided - first check robots.txt,
then /sitemap.xml
- tests: test for full sitemap autodetect, sitemap with limit, and sitemap from specific URL.
Fixes #496
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-03-18 19:14:07 -07:00
|
|
|
await sitemapper.parse(sitemap, url);
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
SAX-based sitemap parser (#497)
Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser
Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a
time (currently 5)
- `fromDate` and `toDate` filter dates, to only include URLs between the given
dates, filtering nested sitemap lists included
- async parsing, continue parsing in the background after 100 URLs
- timeout for initial fetch / first 100 URLs set to 30 seconds to avoid
slowing down the crawl
- save/load state integration: mark if sitemaps have already been parsed
in redis, serialize to save state, to avoid reparsing again. (Will
reparse if parsing did not fully finish)
- Aware of `pageLimit`, don't add URLs pass the page limit, interrupt
further parsing when at limit.
- robots.txt `sitemap:` parsing, check URL extension and mime type
- automatic detection of sitemaps for a seed URL if no sitemap url provided - first check robots.txt,
then /sitemap.xml
- tests: test for full sitemap autodetect, sitemap with limit, and sitemap from specific URL.
Fixes #496
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-03-18 19:14:07 -07:00
|
|
|
logger.warn(
|
|
|
|
|
"Sitemap for seed failed",
|
|
|
|
|
{ url, sitemap, ...formatErr(e) },
|
|
|
|
|
"sitemap",
|
|
|
|
|
);
|
|
|
|
|
return;
|
2020-11-14 21:55:02 +00:00
|
|
|
}
|
SAX-based sitemap parser (#497)
Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser
Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a
time (currently 5)
- `fromDate` and `toDate` filter dates, to only include URLs between the given
dates, filtering nested sitemap lists included
- async parsing, continue parsing in the background after 100 URLs
- timeout for initial fetch / first 100 URLs set to 30 seconds to avoid
slowing down the crawl
- save/load state integration: mark if sitemaps have already been parsed
in redis, serialize to save state, to avoid reparsing again. (Will
reparse if parsing did not fully finish)
- Aware of `pageLimit`, don't add URLs pass the page limit, interrupt
further parsing when at limit.
- robots.txt `sitemap:` parsing, check URL extension and mime type
- automatic detection of sitemaps for a seed URL if no sitemap url provided - first check robots.txt,
then /sitemap.xml
- tests: test for full sitemap autodetect, sitemap with limit, and sitemap from specific URL.
Fixes #496
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-03-18 19:14:07 -07:00
|
|
|
|
|
|
|
|
let power = 1;
|
|
|
|
|
let resolved = false;
|
|
|
|
|
|
|
|
|
|
let finished = false;
|
|
|
|
|
|
|
|
|
|
await new Promise<void>((resolve) => {
|
|
|
|
|
sitemapper.on("end", () => {
|
|
|
|
|
resolve();
|
|
|
|
|
if (!finished) {
|
|
|
|
|
logger.info(
|
|
|
|
|
"Sitemap Parsing Finished",
|
|
|
|
|
{ urlsFound: sitemapper.count, limitHit: sitemapper.atLimit() },
|
|
|
|
|
"sitemap",
|
|
|
|
|
);
|
|
|
|
|
this.crawlState.markSitemapDone();
|
|
|
|
|
finished = true;
|
|
|
|
|
}
|
|
|
|
|
});
|
2024-03-26 14:50:36 -07:00
|
|
|
|
SAX-based sitemap parser (#497)
Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser
Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a
time (currently 5)
- `fromDate` and `toDate` filter dates, to only include URLs between the given
dates, filtering nested sitemap lists included
- async parsing, continue parsing in the background after 100 URLs
- timeout for initial fetch / first 100 URLs set to 30 seconds to avoid
slowing down the crawl
- save/load state integration: mark if sitemaps have already been parsed
in redis, serialize to save state, to avoid reparsing again. (Will
reparse if parsing did not fully finish)
- Aware of `pageLimit`, don't add URLs pass the page limit, interrupt
further parsing when at limit.
- robots.txt `sitemap:` parsing, check URL extension and mime type
- automatic detection of sitemaps for a seed URL if no sitemap url provided - first check robots.txt,
then /sitemap.xml
- tests: test for full sitemap autodetect, sitemap with limit, and sitemap from specific URL.
Fixes #496
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-03-18 19:14:07 -07:00
|
|
|
sitemapper.on("url", ({ url }) => {
|
|
|
|
|
const count = sitemapper.count;
|
|
|
|
|
if (count % 10 ** power === 0) {
|
|
|
|
|
if (count % 10 ** (power + 1) === 0 && power <= 3) {
|
|
|
|
|
power++;
|
|
|
|
|
}
|
|
|
|
|
const sitemapsQueued = sitemapper.getSitemapsQueued();
|
|
|
|
|
logger.debug(
|
|
|
|
|
"Sitemap URLs processed so far",
|
|
|
|
|
{ count, sitemapsQueued },
|
|
|
|
|
"sitemap",
|
|
|
|
|
);
|
|
|
|
|
}
|
2024-07-11 19:48:43 -07:00
|
|
|
this.queueInScopeUrls(seedId, [url], 0, 0, true);
|
SAX-based sitemap parser (#497)
Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser
Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a
time (currently 5)
- `fromDate` and `toDate` filter dates, to only include URLs between the given
dates, filtering nested sitemap lists included
- async parsing, continue parsing in the background after 100 URLs
- timeout for initial fetch / first 100 URLs set to 30 seconds to avoid
slowing down the crawl
- save/load state integration: mark if sitemaps have already been parsed
in redis, serialize to save state, to avoid reparsing again. (Will
reparse if parsing did not fully finish)
- Aware of `pageLimit`, don't add URLs pass the page limit, interrupt
further parsing when at limit.
- robots.txt `sitemap:` parsing, check URL extension and mime type
- automatic detection of sitemaps for a seed URL if no sitemap url provided - first check robots.txt,
then /sitemap.xml
- tests: test for full sitemap autodetect, sitemap with limit, and sitemap from specific URL.
Fixes #496
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-03-18 19:14:07 -07:00
|
|
|
if (count >= 100 && !resolved) {
|
|
|
|
|
logger.info(
|
|
|
|
|
"Sitemap partially parsed, continue parsing large sitemap in the background",
|
|
|
|
|
{ urlsFound: count },
|
|
|
|
|
"sitemap",
|
|
|
|
|
);
|
|
|
|
|
resolve();
|
|
|
|
|
resolved = true;
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
});
|
2020-11-14 21:55:02 +00:00
|
|
|
}
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
|
async combineWARC() {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info("Generating Combined WARCs");
|
2023-05-03 16:25:59 -07:00
|
|
|
await this.crawlState.setStatus("generate-warc");
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
|
// Get the list of created Warcs
|
2024-03-22 17:32:42 -07:00
|
|
|
const warcLists = await fsp.readdir(this.archivesDir);
|
2021-04-29 14:34:56 -07:00
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Combining ${warcLists.length} WARCs...`);
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
|
const fileSizeObjects = []; // Used to sort the created warc by fileSize
|
|
|
|
|
|
|
|
|
|
// Go through a list of the created works and create an array sorted by their filesize with the largest file first.
|
|
|
|
|
for (let i = 0; i < warcLists.length; i++) {
|
2024-03-22 17:32:42 -07:00
|
|
|
const fileName = path.join(this.archivesDir, warcLists[i]);
|
2022-02-08 15:31:55 -08:00
|
|
|
const fileSize = await getFileSize(fileName);
|
2023-11-09 11:27:11 -08:00
|
|
|
fileSizeObjects.push({ fileSize: fileSize, fileName: fileName });
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
fileSizeObjects.sort((a, b) => b.fileSize - a.fileSize);
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const generatedCombinedWarcs = [];
|
|
|
|
|
|
|
|
|
|
// Used to name combined warcs, default to -1 for first increment
|
|
|
|
|
let combinedWarcNumber = -1;
|
|
|
|
|
|
|
|
|
|
// write combine WARC to collection root
|
|
|
|
|
let combinedWarcFullPath = "";
|
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
// fileHandler
|
|
|
|
|
let fh = null;
|
|
|
|
|
|
2021-03-31 13:41:27 -04:00
|
|
|
// Iterate through the sorted file size array.
|
|
|
|
|
for (let j = 0; j < fileSizeObjects.length; j++) {
|
|
|
|
|
// if need to rollover to new warc
|
|
|
|
|
let doRollover = false;
|
|
|
|
|
|
|
|
|
|
// set to true for first warc
|
|
|
|
|
if (combinedWarcNumber < 0) {
|
|
|
|
|
doRollover = true;
|
|
|
|
|
} else {
|
|
|
|
|
// Check the size of the existing combined warc.
|
2022-02-08 15:31:55 -08:00
|
|
|
const currentCombinedWarcSize = await getFileSize(combinedWarcFullPath);
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
|
// If adding the current warc to the existing combined file creates a file smaller than the rollover size add the data to the combinedWarc
|
2023-11-09 11:27:11 -08:00
|
|
|
const proposedWarcSize =
|
|
|
|
|
fileSizeObjects[j].fileSize + currentCombinedWarcSize;
|
2021-03-31 13:41:27 -04:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
doRollover = proposedWarcSize >= this.params.rolloverSize;
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (doRollover) {
|
2021-06-23 19:36:32 -07:00
|
|
|
// If adding the current warc to the existing combined file creates a file larger than the rollover size do the following:
|
2021-03-31 13:41:27 -04:00
|
|
|
// 1. increment the combinedWarcNumber
|
|
|
|
|
// 2. create the name of the new combinedWarcFile
|
|
|
|
|
// 3. Write the header out to the new file
|
|
|
|
|
// 4. Write out the current warc data to the combinedFile
|
|
|
|
|
combinedWarcNumber = combinedWarcNumber + 1;
|
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
const combinedWarcName = `${this.params.collection}_${combinedWarcNumber}.warc.gz`;
|
2021-03-31 13:41:27 -04:00
|
|
|
|
|
|
|
|
// write combined warcs to root collection dir as they're output of a collection (like wacz)
|
|
|
|
|
combinedWarcFullPath = path.join(this.collDir, combinedWarcName);
|
|
|
|
|
|
2021-04-29 14:34:56 -07:00
|
|
|
if (fh) {
|
|
|
|
|
fh.end();
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
fh = fs.createWriteStream(combinedWarcFullPath, { flags: "a" });
|
2021-04-29 14:34:56 -07:00
|
|
|
|
2021-03-31 13:41:27 -04:00
|
|
|
generatedCombinedWarcs.push(combinedWarcName);
|
|
|
|
|
|
2024-05-22 15:47:05 -07:00
|
|
|
const warcBuffer = await createWARCInfo(combinedWarcName);
|
2021-04-29 14:34:56 -07:00
|
|
|
fh.write(warcBuffer);
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Appending WARC ${fileSizeObjects[j].fileName}`);
|
2021-04-29 14:34:56 -07:00
|
|
|
|
|
|
|
|
const reader = fs.createReadStream(fileSizeObjects[j].fileName);
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const p = new Promise<void>((resolve) => {
|
2021-04-29 14:34:56 -07:00
|
|
|
reader.on("end", () => resolve());
|
|
|
|
|
});
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
if (fh) {
|
|
|
|
|
reader.pipe(fh, { end: false });
|
|
|
|
|
}
|
2021-04-29 14:34:56 -07:00
|
|
|
|
|
|
|
|
await p;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (fh) {
|
|
|
|
|
await fh.end();
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.debug(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
|
2021-03-31 13:41:27 -04:00
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
async serializeConfig(done = false) {
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
switch (this.params.saveState) {
|
2023-11-09 19:11:11 -05:00
|
|
|
case "never":
|
2022-03-14 10:41:56 -07:00
|
|
|
return;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
case "partial":
|
|
|
|
|
if (!done) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
if (await this.crawlState.isFinished()) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case "always":
|
|
|
|
|
default:
|
|
|
|
|
break;
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
}
|
|
|
|
|
|
2022-03-14 10:41:56 -07:00
|
|
|
const now = new Date();
|
|
|
|
|
|
|
|
|
|
if (!done) {
|
|
|
|
|
// if not done, save state only after specified interval has elapsed
|
2023-11-09 11:27:11 -08:00
|
|
|
if (
|
|
|
|
|
secondsElapsed(this.lastSaveTime, now) < this.params.saveStateInterval
|
|
|
|
|
) {
|
2022-03-14 10:41:56 -07:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
this.lastSaveTime = now.getTime();
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const ts = now.toISOString().slice(0, 19).replace(/[T:-]/g, "");
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
|
|
|
|
const crawlDir = path.join(this.collDir, "crawls");
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
await fsp.mkdir(crawlDir, { recursive: true });
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
const filenameOnly = `crawl-${ts}-${this.params.crawlId}.yaml`;
|
|
|
|
|
|
|
|
|
|
const filename = path.join(crawlDir, filenameOnly);
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
|
|
|
|
|
const state = await this.crawlState.serialize();
|
|
|
|
|
|
|
|
|
|
if (this.origConfig) {
|
|
|
|
|
this.origConfig.state = state;
|
|
|
|
|
}
|
2023-11-09 11:27:11 -08:00
|
|
|
const res = yaml.dump(this.origConfig, { lineWidth: -1 });
|
2022-03-14 10:41:56 -07:00
|
|
|
try {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Saving crawl state to: ${filename}`);
|
2022-03-14 10:41:56 -07:00
|
|
|
await fsp.writeFile(filename, res);
|
2023-11-14 21:54:40 -08:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error(`Failed to write save state file: ${filename}`, e);
|
2022-03-14 10:41:56 -07:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
this.saveStateFiles.push(filename);
|
|
|
|
|
|
|
|
|
|
if (this.saveStateFiles.length > this.params.saveStateHistory) {
|
|
|
|
|
const oldFilename = this.saveStateFiles.shift();
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.info(`Removing old save-state: ${oldFilename}`);
|
2022-03-14 10:41:56 -07:00
|
|
|
try {
|
2023-11-09 11:27:11 -08:00
|
|
|
await fsp.unlink(oldFilename || "");
|
2022-03-14 10:41:56 -07:00
|
|
|
} catch (e) {
|
2023-03-17 14:24:44 -07:00
|
|
|
logger.error(`Failed to delete old save state file: ${oldFilename}`);
|
2022-03-14 10:41:56 -07:00
|
|
|
}
|
|
|
|
|
}
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
|
|
|
|
if (this.storage && done && this.params.saveState === "always") {
|
|
|
|
|
const targetFilename = interpolateFilename(filenameOnly, this.crawlId);
|
|
|
|
|
|
|
|
|
|
await this.storage.uploadFile(filename, targetFilename);
|
|
|
|
|
}
|
State Save + Restore State from Config + Redis State + Scope Fix 0.5.0 (#78)
* save state work:
- support interrupting and saving crawl
- support loading crawl state (frontier queue, pending, done) from YAML
- support scope check when loading to apply new scoping rules when restarting crawl
- failed urls added to done as failed, can be retried if crawl is stopped and restarted
- save state to crawls/crawl-<ts>-<id>.yaml when interrupted
- --saveState option controls when crawl state is saved, default to partial/when interrupted, also always, never.
- support in-memory or redis based crawl state, using fork of puppeteer-cluster
- --redisStore used to enable redis-based state
* signals/crawl interruption:
- crawl state set to drain/not provide any more urls to crawl
- graceful stop of crawl in response to sigint/sigterm
- initial sigint/sigterm waits for graceful end of current pages, second terminates immediately
- initial sigabrt followed by sigterm terminates immediately
- puppeteer disable handleSIGTERM, handleSIGHUP, handleSIGINT
* redis state support:
- use lua scripts for atomic move from queue -> pending, and pending -> done
- pending key expiry set to page timeout
- add numPending() and numSeen() to support better puppeteer-cluster semantics for early termination
- drainMax returns the numPending() + numSeen() to work with cluster stats
* arg improvements:
- add --crawlId param, also settable via CRAWL_ID env var, defaulting to os.hostname() (used for redis key and crawl state file)
- support setting cmdline args via env var CRAWL_ARGS
- use 'choices' in args when possible
* build update:
- switch base browser image to new webrecorder/browsertrix-browser-base, simple image with .deb files only for amd64 and arm64 builds
- use setuptools<58.0
* misc crawl/scoping rule fixes:
- scoping rules fix when external is used with scopeType
state:
- limit: ensure no urls, including initial seeds, are added past the limit
- signals: fix immediate shutdown on second signal
- tests: add scope test for default scope + excludes
* py-wacz update
- add 'seed': true to pages that are seeds for optimized wacz creation, keeping non-seeds separate (supported via wacz 0.3.2)
- pywb: use latest pywb branch for improved twitter video capture
* update to latest browsertrix-behaviors
* fix setuptools dependency #88
* update README for 0.5.0 beta
2021-09-28 09:41:16 -07:00
|
|
|
}
|
2024-03-22 17:32:42 -07:00
|
|
|
|
2024-03-26 14:54:27 -07:00
|
|
|
getWarcPrefix(defaultValue = "") {
|
|
|
|
|
let warcPrefix =
|
|
|
|
|
process.env.WARC_PREFIX || this.params.warcPrefix || defaultValue;
|
|
|
|
|
|
|
|
|
|
if (warcPrefix) {
|
|
|
|
|
warcPrefix += "-" + this.crawlId + "-";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return warcPrefix;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
createExtraResourceWarcWriter(resourceName: string, gzip = true) {
|
2024-04-15 13:43:08 -07:00
|
|
|
const filenameBase = `${this.getWarcPrefix()}${resourceName}-$ts`;
|
2024-03-26 14:54:27 -07:00
|
|
|
|
|
|
|
|
return this.createWarcWriter(filenameBase, gzip, { resourceName });
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
createWarcWriter(
|
|
|
|
|
filenameBase: string,
|
|
|
|
|
gzip: boolean,
|
|
|
|
|
logDetails: Record<string, string>,
|
|
|
|
|
) {
|
|
|
|
|
const filenameTemplate = `${filenameBase}.warc${gzip ? ".gz" : ""}`;
|
|
|
|
|
|
|
|
|
|
return new WARCWriter({
|
|
|
|
|
archivesDir: this.archivesDir,
|
|
|
|
|
tempCdxDir: this.tempCdxDir,
|
|
|
|
|
filenameTemplate,
|
|
|
|
|
rolloverSize: this.params.rolloverSize,
|
|
|
|
|
gzip,
|
|
|
|
|
logDetails,
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
createRecorder(id: number): Recorder | null {
|
|
|
|
|
if (!this.recording) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-26 14:54:27 -07:00
|
|
|
const filenameBase = `${this.getWarcPrefix("rec")}$ts-${id}`;
|
|
|
|
|
|
|
|
|
|
const writer = this.createWarcWriter(filenameBase, true, {
|
|
|
|
|
id: id.toString(),
|
|
|
|
|
});
|
|
|
|
|
|
2024-03-22 17:32:42 -07:00
|
|
|
const res = new Recorder({
|
|
|
|
|
workerid: id,
|
|
|
|
|
crawler: this,
|
2024-03-26 14:54:27 -07:00
|
|
|
writer,
|
|
|
|
|
tempdir: this.tempdir,
|
2024-03-22 17:32:42 -07:00
|
|
|
});
|
|
|
|
|
|
|
|
|
|
this.browser.recorders.push(res);
|
|
|
|
|
return res;
|
|
|
|
|
}
|
2020-11-01 19:22:53 -08:00
|
|
|
}
|
2020-10-31 13:16:37 -07:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
function getDownloadResponse(req: HTTPRequest) {
|
2022-03-14 14:41:39 -07:00
|
|
|
try {
|
2024-06-26 09:16:24 -07:00
|
|
|
if (!req.isNavigationRequest()) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
const failure = req.failure();
|
2023-11-09 19:11:11 -05:00
|
|
|
const failureText = (failure && failure.errorText) || "";
|
|
|
|
|
if (
|
|
|
|
|
failureText !== "net::ERR_ABORTED" ||
|
|
|
|
|
req.resourceType() !== "document"
|
|
|
|
|
) {
|
2024-06-26 09:16:24 -07:00
|
|
|
return null;
|
2022-03-14 14:41:39 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const resp = req.response();
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
if (!resp) {
|
|
|
|
|
return null;
|
2022-03-14 14:41:39 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
const headers = resp.headers();
|
|
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
if (
|
|
|
|
|
headers["content-disposition"] ||
|
2024-06-26 09:16:24 -07:00
|
|
|
(headers["content-type"] && !isHTMLMime(headers["content-type"]))
|
2023-11-09 19:11:11 -05:00
|
|
|
) {
|
2024-06-26 09:16:24 -07:00
|
|
|
return resp;
|
2022-03-14 14:41:39 -07:00
|
|
|
}
|
|
|
|
|
} catch (e) {
|
2024-06-26 09:16:24 -07:00
|
|
|
console.log(e);
|
|
|
|
|
// ignore
|
2022-03-14 14:41:39 -07:00
|
|
|
}
|
2023-11-09 11:27:11 -08:00
|
|
|
|
2024-06-26 09:16:24 -07:00
|
|
|
return null;
|
2022-03-14 14:41:39 -07:00
|
|
|
}
|