browsertrix-crawler/src/create-login-profile.ts

705 lines
18 KiB
TypeScript
Raw Normal View History

#!/usr/bin/env node
import fs from "fs";
import path from "path";
import http, { IncomingMessage, ServerResponse } from "http";
import readline from "readline";
import child_process from "child_process";
import yargs, { Options } from "yargs";
import { logger } from "./util/logger.js";
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
import { Browser } from "./util/browser.js";
import { initStorage } from "./util/storage.js";
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
import { getInfoString } from "./util/file_reader.js";
import { DISPLAY } from "./util/constants.js";
const profileHTML = fs.readFileSync(
new URL("../html/createProfile.html", import.meta.url),
{ encoding: "utf8" },
);
const vncHTML = fs.readFileSync(
new URL("../html/vnc_lite.html", import.meta.url),
{ encoding: "utf8" },
);
const behaviors = fs.readFileSync(
new URL(
"../node_modules/browsertrix-behaviors/dist/behaviors.js",
import.meta.url,
),
{ encoding: "utf8" },
);
function cliOpts(): { [key: string]: Options } {
return {
url: {
describe: "The URL of the login page",
type: "string",
demandOption: true,
},
user: {
describe:
"The username for the login. If not specified, will be prompted",
},
password: {
describe:
"The password for the login. If not specified, will be prompted (recommended)",
},
filename: {
describe:
"The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided",
default: "/crawls/profiles/profile.tar.gz",
},
debugScreenshot: {
describe:
"If specified, take a screenshot after login and save as this filename",
},
headless: {
describe: "Run in headless mode, otherwise start xvfb",
type: "boolean",
default: false,
},
automated: {
describe: "Start in automated mode, no interactive browser",
type: "boolean",
default: false,
},
interactive: {
describe: "Deprecated. Now the default option!",
type: "boolean",
default: false,
},
shutdownWait: {
describe:
"Shutdown browser in interactive after this many seconds, if no pings received",
type: "number",
default: 0,
},
profile: {
describe:
"Path or HTTP(S) URL to tar.gz file which contains the browser profile directory",
type: "string",
},
windowSize: {
type: "string",
describe: "Browser window dimensions, specified as: width,height",
default: getDefaultWindowSize(),
},
proxyServer: {
describe:
"if set, will use specified proxy server. Takes precedence over any env var proxy settings",
type: "string",
},
cookieDays: {
type: "number",
describe:
"If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
default: 7,
},
};
}
function getDefaultWindowSize() {
const values = (process.env.GEOMETRY || "").split("x");
const x = Number(values[0]);
const y = Number(values[1]);
return `${x},${y}`;
}
function handleTerminate(signame: string) {
logger.info(`Got signal ${signame}, exiting`);
process.exit(1);
}
async function main() {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const params: any = yargs(process.argv)
.usage("browsertrix-crawler profile [options]")
.option(cliOpts()).argv;
logger.setDebugLogging(true);
logger.info(await getInfoString());
process.on("SIGINT", () => handleTerminate("SIGINT"));
process.on("SIGTERM", () => handleTerminate("SIGTERM"));
if (!params.headless) {
logger.debug("Launching XVFB");
child_process.spawn("Xvfb", [
DISPLAY,
"-listen",
"tcp",
"-screen",
"0",
process.env.GEOMETRY || "",
"-ac",
"+extension",
"RANDR",
]);
//await fsp.mkdir(path.join(homedir(), ".vnc"), {recursive: true});
//child_process.spawnSync("x11vnc", ["-storepasswd", process.env.VNC_PASS, path.join(homedir(), ".vnc", "passwd")]);
child_process.spawn("x11vnc", [
"-forever",
"-ncache_cr",
"-xdamage",
"-usepw",
"-shared",
"-rfbport",
"6080",
"-passwd",
process.env.VNC_PASS || "",
"-display",
DISPLAY,
]);
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
const browser = new Browser();
await browser.launch({
profileUrl: params.profile,
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
headless: params.headless,
signals: false,
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
chromeOptions: {
proxy: params.proxyServer,
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
extraArgs: [
"--window-position=0,0",
`--window-size=${params.windowSize}`,
// to disable the 'stability will suffer' infobar
"--test-type",
],
},
recording: false,
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
});
if (params.interactive) {
logger.warn(
"Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode",
);
}
if (params.user || params.password) {
params.automated = true;
}
if (!params.user && params.automated) {
params.user = await promptInput("Enter username: ");
}
if (!params.password && params.automated) {
params.password = await promptInput("Enter password: ", true);
}
const { page, cdp } = await browser.newWindowPageWithCDP();
const waitUntil: PuppeteerLifeCycleEvent = "load";
await page.setCacheEnabled(false);
if (!params.automated) {
await browser.setupPage({ page, cdp });
// for testing, inject browsertrix-behaviors
await browser.addInitScript(
page,
behaviors + ";\nself.__bx_behaviors.init();",
);
}
if (!params.automated) {
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
const target = await cdp.send("Target.getTargetInfo");
const targetId = target.targetInfo.targetId;
const ibrowser = new InteractiveBrowser(
params,
browser,
page,
cdp,
targetId,
);
await ibrowser.startLoad(waitUntil);
} else {
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
await automatedProfile(params, browser, page, cdp, waitUntil);
}
}
async function automatedProfile(
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
params: any,
browser: Browser,
page: Page,
cdp: CDPSession,
waitUntil: PuppeteerLifeCycleEvent,
) {
let u, p;
logger.info(`Loading page: ${params.url}`);
try {
await page.goto(params.url, { waitUntil });
} catch (e) {
logger.error("Page Load Failed/Interrupted", e);
}
logger.debug("Looking for username and password entry fields on page...");
try {
u = await page.waitForSelector(
"input[name='user'],input[name='username'],input[name='email']",
);
p = await page.waitForSelector(
"input[type='password'].input[name='pass'],input[name='password']",
);
} catch (e) {
if (params.debugScreenshot) {
await page.screenshot({ path: params.debugScreenshot });
}
logger.debug("Login form could not be found");
await page.close();
process.exit(1);
return;
}
await u!.type(params.user);
await p!.type(params.password);
await Promise.allSettled([
p!.press("Enter"),
page.waitForNavigation({ waitUntil }),
]);
if (params.debugScreenshot) {
await page.screenshot({ path: params.debugScreenshot });
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
await createProfile(params, browser, page, cdp);
process.exit(0);
}
async function createProfile(
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
params: any,
browser: Browser,
page: Page,
cdp: CDPSession,
targetFilename = "",
) {
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
await cdp.send("Network.clearBrowserCache");
await browser.close();
logger.info("Creating profile");
if (params.filename && !params.filename.startsWith("/")) {
params.filename = path.resolve("/crawls/profiles/", params.filename);
logger.info(
`Absolute path for filename not provided, saving to ${params.filename}`,
);
}
const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz";
const outputDir = path.dirname(profileFilename);
if (outputDir && !fs.existsSync(outputDir)) {
fs.mkdirSync(outputDir, { recursive: true });
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
browser.saveProfile(profileFilename);
let resource = {};
const storage = initStorage();
if (storage) {
logger.info("Uploading to remote storage...");
resource = await storage.uploadFile(profileFilename, targetFilename);
}
logger.info("Profile creation done");
return resource;
}
function promptInput(msg: string, hidden = false) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const rl: any = readline.createInterface({
input: process.stdin,
output: process.stdout,
});
if (hidden) {
// from https://stackoverflow.com/a/59727173
rl.input.on("keypress", function () {
// get the number of characters entered so far:
const len = rl.line.length;
// move cursor back to the beginning of the input:
readline.moveCursor(rl.output, -len, 0);
// clear everything to the right of the cursor:
readline.clearLine(rl.output, 1);
// replace the original input with asterisks:
for (let i = 0; i < len; i++) {
rl.output.write("*");
}
});
}
return new Promise<string>((resolve) => {
rl.question(msg, function (res: string) {
rl.close();
resolve(res);
});
});
}
class InteractiveBrowser {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
params: any;
browser: Browser;
page: Page;
cdp: CDPSession;
targetId: string;
originSet = new Set<string>();
shutdownWait: number;
shutdownTimer: NodeJS.Timer | null;
constructor(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
params: any,
browser: Browser,
page: Page,
cdp: CDPSession,
targetId: string,
) {
logger.info("Creating Profile Interactively...");
if (params.headless) {
child_process.spawn("socat", [
"tcp-listen:9222,reuseaddr,fork",
"tcp:localhost:9221",
]);
}
this.params = params;
this.browser = browser;
this.page = page;
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
this.cdp = cdp;
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
this.targetId = targetId;
this.addOrigin();
page.on("load", () => this.handlePageLoad());
// attempt to keep everything to initial tab if headless
if (this.params.headless) {
cdp.send("Page.enable");
cdp.on("Page.windowOpen", async (resp) => {
if (!resp.url) {
return;
}
try {
await cdp.send("Target.activateTarget", { targetId: this.targetId });
await page.goto(resp.url);
} catch (e) {
logger.error("Page Load Failed/Interrupted", e);
}
});
}
this.shutdownWait = params.shutdownWait * 1000;
if (this.shutdownWait) {
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
logger.debug(
`Shutting down in ${this.shutdownWait}ms if no ping received`,
);
} else {
this.shutdownTimer = null;
}
const httpServer = http.createServer((req, res) =>
this.handleRequest(req, res),
);
const port = 9223;
httpServer.listen(port);
logger.info(
`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`,
);
if (!params.headless) {
logger.info("Screencasting with VNC on port 6080");
} else {
logger.info("Screencasting with CDP on port 9222");
}
}
async startLoad(waitUntil: PuppeteerLifeCycleEvent = "load") {
logger.info(`Loading page: ${this.params.url}`);
try {
await this.page.goto(this.params.url, { waitUntil, timeout: 0 });
logger.info("Loaded!");
} catch (e) {
logger.warn("Page Load Failed/Interrupted", e);
}
}
handlePageLoad() {
this.addOrigin();
this.saveCookiesFor(this.page.url());
}
async saveAllCookies() {
logger.info("Saving all cookies");
for (const origin of this.originSet.values()) {
await this.saveCookiesFor(origin + "/");
}
}
async saveCookiesFor(url: string) {
try {
if (this.params.cookieDays <= 0) {
return;
}
const cookies = await this.browser.getCookies(this.page);
for (const cookieOrig of cookies) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const cookie = cookieOrig as any;
cookie.expires =
new Date().getTime() / 1000 + this.params.cookieDays * 86400;
delete cookie.size;
delete cookie.session;
if (
cookie.sameSite &&
cookie.sameSite !== "Lax" &&
cookie.sameSite !== "Strict"
) {
delete cookie.sameSite;
}
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... - screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
if (!cookie.domain && !cookie.path) {
cookie.url = url;
}
}
await this.browser.setCookies(this.page, cookies);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
logger.error("Save Cookie Error: ", e);
}
}
addOrigin() {
const url = this.page.url();
logger.debug("Adding origin", { url });
if (url.startsWith("http:") || url.startsWith("https:")) {
this.originSet.add(new URL(url).origin);
}
}
async handleRequest(req: IncomingMessage, res: ServerResponse) {
const parsedUrl = new URL(req.url || "", `http://${req.headers.host}`);
const pathname = parsedUrl.pathname;
let targetUrl;
let origins;
switch (pathname) {
case "/":
res.writeHead(200, { "Content-Type": "text/html" });
if (this.params.headless) {
targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`;
} else {
targetUrl = `http://$HOST:9223/vnc/?host=$HOST&port=6080&password=${process.env.VNC_PASS}`;
}
res.end(
profileHTML.replace(
"$DEVTOOLS_SRC",
targetUrl.replaceAll("$HOST", parsedUrl.hostname),
),
);
return;
case "/vnc/":
case "/vnc/index.html":
res.writeHead(200, { "Content-Type": "text/html" });
res.end(vncHTML);
return;
case "/ping":
if (this.shutdownWait) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
clearTimeout(this.shutdownTimer as any);
this.shutdownTimer = setTimeout(
() => process.exit(0),
this.shutdownWait,
);
logger.debug(
`Ping received, delaying shutdown for ${this.shutdownWait}ms`,
);
}
origins = Array.from(this.originSet.values());
res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify({ pong: true, origins }));
return;
case "/target":
res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify({ targetId: this.targetId }));
return;
case "/vncpass":
res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify({ password: process.env.VNC_PASS }));
return;
case "/navigate":
if (req.method !== "POST") {
break;
}
try {
const postData = await this.readBodyJson(req);
const url = new URL(postData.url).href;
res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify({ success: true }));
logger.info("Loading Page", { page: url });
this.page
.goto(url)
.catch((e) => logger.warn("Page Load Failed/Interrupted", e));
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
res.writeHead(400, { "Content-Type": "application/json" });
res.end(JSON.stringify({ error: e.toString() }));
logger.warn("HTTP Error", e);
}
return;
case "/createProfileJS":
if (req.method !== "POST") {
break;
}
try {
const postData = await this.readBodyJson(req);
const targetFilename = postData.filename || "";
await this.saveAllCookies();
const resource = await createProfile(
this.params,
this.browser,
this.page,
this.cdp,
targetFilename,
);
origins = Array.from(this.originSet.values());
res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify({ resource, origins }));
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
res.writeHead(500, { "Content-Type": "application/json" });
res.end(JSON.stringify({ error: e.toString() }));
logger.warn("HTTP Error", e);
}
setTimeout(() => process.exit(0), 200);
return;
case "/createProfile":
if (req.method !== "POST") {
break;
}
try {
await this.saveAllCookies();
await createProfile(this.params, this.browser, this.page, this.cdp);
res.writeHead(200, { "Content-Type": "text/html" });
res.end(
"<html><body>Profile Created! You may now close this window.</body></html>",
);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
res.writeHead(500, { "Content-Type": "text/html" });
res.end(
"<html><body>Profile creation failed! See the browsertrix-crawler console for more info",
);
logger.warn("HTTP Error", e);
}
setTimeout(() => process.exit(0), 200);
return;
}
if (pathname.startsWith("/vnc/")) {
const fileUrl = new URL(
"../node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length),
import.meta.url,
);
const file = fs.readFileSync(fileUrl, { encoding: "utf-8" });
res.writeHead(200, { "Content-Type": "application/javascript" });
res.end(file);
return;
}
res.writeHead(404, { "Content-Type": "text/html" });
res.end("Not Found");
}
async readBodyJson(req: IncomingMessage) {
const buffers = [];
for await (const chunk of req) {
buffers.push(chunk);
}
const data = Buffer.concat(buffers).toString();
if (data.length) {
try {
return JSON.parse(data) || {};
} catch (e) {
return {};
}
}
}
}
main();