browsertrix-crawler/util/browser.js

340 lines
8.4 KiB
JavaScript
Raw Normal View History

Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
import * as child_process from "child_process";
import fs from "fs";
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
import { pipeline } from "node:stream/promises";
import { Readable } from "node:stream";
import os from "os";
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
import path from "path";
import { logger } from "./logger.js";
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
import { initStorage } from "./storage.js";
import { chromium } from "playwright-core";
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
// ==================================================================
export class Browser
{
constructor() {
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
this.customProfile = false;
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
}
async launch({profileUrl, chromeOptions, signals = false, headless = false, emulateDevice = {viewport: null}} = {}) {
if (this.isLaunched()) {
return;
}
if (profileUrl) {
this.customProfile = await this.loadProfile(profileUrl);
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
const args = this.chromeArgs(chromeOptions);
const launchOpts = {
...emulateDevice,
args,
headless,
executablePath: this.getBrowserExe(),
ignoreDefaultArgs: ["--enable-automation"],
ignoreHTTPSErrors: true,
handleSIGHUP: signals,
handleSIGINT: signals,
handleSIGTERM: signals,
serviceWorkers: "allow"
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
};
await this._init(launchOpts);
}
async setupPage({page, cdp}) {
await page.addInitScript("Object.defineProperty(navigator, \"webdriver\", {value: false});");
if (this.customProfile) {
logger.info("Disabling Service Workers for profile", {}, "browser");
await cdp.send("Network.setBypassServiceWorker", {bypass: true});
}
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
async getFirstPageWithCDP() {
return {page: this.firstPage, cdp: this.firstCDP};
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
async loadProfile(profileFilename) {
const targetFilename = "/tmp/profile.tar.gz";
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
if (profileFilename &&
(profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
logger.info(`Downloading ${profileFilename} to ${targetFilename}`, {}, "browserProfile");
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
const resp = await fetch(profileFilename);
await pipeline(
Readable.fromWeb(resp.body),
fs.createWriteStream(targetFilename)
);
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
profileFilename = targetFilename;
} else if (profileFilename && profileFilename.startsWith("@")) {
const storage = initStorage("");
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
if (!storage) {
logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
await storage.downloadFile(profileFilename.slice(1), targetFilename);
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
profileFilename = targetFilename;
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
if (profileFilename) {
try {
child_process.execSync("tar xvfz " + profileFilename, {cwd: this.profileDir});
return true;
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
} catch (e) {
logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
}
}
return false;
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
saveProfile(profileFilename) {
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: this.profileDir});
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
chromeArgs({proxy=true, userAgent=null, extraArgs=[]} = {}) {
// Chrome Flags, including proxy server
const args = [
...(process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean),
//"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
"--no-sandbox",
"--disable-background-media-suspend",
"--remote-debugging-port=9221",
"--autoplay-policy=no-user-gesture-required",
"--disable-site-isolation-trials",
`--user-agent=${userAgent || this.getDefaultUA()}`,
...extraArgs,
];
if (proxy) {
args.push("--ignore-certificate-errors");
args.push(`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`);
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
return args;
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
getDefaultUA() {
let version = process.env.BROWSER_VERSION;
try {
version = child_process.execFileSync(this.getBrowserExe(), ["--version"], {encoding: "utf8"});
version = version.match(/[\d.]+/)[0];
} catch(e) {
console.error(e);
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
getBrowserExe() {
const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
for (const file of files) {
if (file && fs.existsSync(file)) {
return file;
}
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
return null;
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
async evaluateWithCLI(context, frame, funcString, logData, contextName) {
let details = {frameUrl: frame.url(), ...logData};
logger.info("Run Script Started", details, contextName);
const cdp = await context.newCDPSession(frame);
// from puppeteer _evaluateInternal() but with includeCommandLineAPI: true
//const contextId = context._contextId;
const expression = funcString + "\n//# sourceURL=__playwright_evaluation_script__";
const { exceptionDetails, result } = await cdp
.send("Runtime.evaluate", {
expression,
//contextId,
returnByValue: true,
awaitPromise: true,
userGesture: true,
includeCommandLineAPI: true,
});
if (exceptionDetails) {
if (exceptionDetails.stackTrace) {
details = {...exceptionDetails.stackTrace, text: exceptionDetails.text, ...details};
}
logger.error("Run Script Failed", details, contextName);
} else {
logger.info("Run Script Finished", details, contextName);
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
try {
await cdp.detach();
} catch (e) {
logger.warn("Detach failed", details, contextName);
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
return result.value;
}
}
// ==================================================================
export class NewContextBrowser extends Browser
{
constructor() {
super();
this.browser = null;
this.contexts = new Map();
this.launchOpts = null;
}
isLaunched() {
if (this.browser) {
logger.warn("Browser already inited", {}, "browser");
return true;
}
return false;
}
async close() {
for (const context of this.contexts.values()) {
await context.close();
}
this.contexts.clear();
if (this.browser) {
await this.browser.close();
this.browser = null;
}
}
async closePage(page) {
try {
await page.close();
} catch (e) {
// ignore
}
const context = this.contexts.get(page);
if (context) {
await context.close();
this.contexts.delete(page);
}
}
async _init(launchOpts) {
if (this.customProfile) {
throw new Error("not supported yet");
}
this.browser = await chromium.launch(launchOpts);
this.launchOpts = launchOpts;
}
numPages() {
return this.contexts.length;
}
async newWindowPageWithCDP(storageState) {
const context = await this.browser.newContext({...this.launchOpts, storageState});
const page = await context.newPage();
const cdp = await context.newCDPSession(page);
this.contexts.set(page, context);
return {page, cdp};
}
}
// ==================================================================
export class PersistentContextBrowser extends Browser
{
constructor() {
super();
this.context = null;
this.firstPage = null;
this.firstCDP = null;
}
isLaunched() {
if (this.context) {
logger.warn("Context already inited", {}, "context");
return true;
}
return false;
}
async close() {
if (this.context) {
await this.context.close();
this.context = null;
}
}
async closePage(page) {
await page.close();
}
async _init(launchOpts) {
this.context = await chromium.launchPersistentContext(this.profileDir, launchOpts);
if (this.context.pages().length) {
this.firstPage = this.context.pages()[0];
} else {
this.firstPage = await this.context.newPage();
}
this.firstCDP = await this.context.newCDPSession(this.firstPage);
}
numPages() {
return this.context ? this.context.pages().length : 0;
}
async newWindowPageWithCDP() {
// unique url to detect new pages
const startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);
const p = new Promise((resolve) => {
const listener = (page) => {
if (page.url() === startPage) {
resolve(page);
this.context.removeListener("page", listener);
}
};
this.context.on("page", listener);
});
await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
const page = await p;
const cdp = await this.context.newCDPSession(page);
return {page, cdp};
}
}