#!/usr/bin/env node import fs from "fs"; import path from "path"; import http from "http"; import readline from "readline"; import child_process from "child_process"; import yargs from "yargs"; import { logger } from "./util/logger.js"; import { Browser } from "./util/browser.js"; import { initStorage } from "./util/storage.js"; const profileHTML = fs.readFileSync(new URL("html/createProfile.html", import.meta.url), {encoding: "utf8"}); const vncHTML = fs.readFileSync(new URL("html/vnc_lite.html", import.meta.url), {encoding: "utf8"}); const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"}); function cliOpts() { return { "url": { describe: "The URL of the login page", type: "string", demandOption: true, }, "user": { describe: "The username for the login. If not specified, will be prompted", }, "password": { describe: "The password for the login. If not specified, will be prompted (recommended)", }, "filename": { describe: "The filename for the profile tarball", default: "/crawls/profiles/profile.tar.gz", }, "debugScreenshot": { describe: "If specified, take a screenshot after login and save as this filename" }, "headless": { describe: "Run in headless mode, otherwise start xvfb", type: "boolean", default: false, }, "automated": { describe: "Start in automated mode, no interactive browser", type: "boolean", default: false, }, "interactive": { describe: "Deprecated. Now the default option!", type: "boolean", default: false }, "shutdownWait": { describe: "Shutdown browser in interactive after this many seconds, if no pings received", type: "number", default: 0 }, "profile": { describe: "Path to tar.gz file which will be extracted and used as the browser profile", type: "string", }, "windowSize": { type: "string", describe: "Browser window dimensions, specified as: width,height", default: getDefaultWindowSize() }, "proxy": { type: "boolean", default: false }, "cookieDays": { type: "number", describe: "If >0, set all cookies, including session cookies, to have this duration in days before saving profile", default: 7 } }; } function getDefaultWindowSize() { const values = process.env.GEOMETRY.split("x"); const x = Number(values[0]); const y = Number(values[1]); return `${x},${y}`; } async function main() { const params = yargs(process.argv) .usage("browsertrix-crawler profile [options]") .option(cliOpts()) .argv; logger.setDebugLogging(true); if (!params.headless) { logger.debug("Launching XVFB"); child_process.spawn("Xvfb", [ process.env.DISPLAY, "-listen", "tcp", "-screen", "0", process.env.GEOMETRY, "-ac", "+extension", "RANDR" ]); //await fsp.mkdir(path.join(homedir(), ".vnc"), {recursive: true}); //child_process.spawnSync("x11vnc", ["-storepasswd", process.env.VNC_PASS, path.join(homedir(), ".vnc", "passwd")]); child_process.spawn("x11vnc", [ "-forever", "-ncache_cr", "-xdamage", "-usepw", "-shared", "-rfbport", "6080", "-passwd", process.env.VNC_PASS, "-display", process.env.DISPLAY ]); } const browser = new Browser(); await browser.launch({ profileUrl: params.profile, headless: params.headless, signals: true, chromeOptions: { proxy: false, extraArgs: [ "--window-position=0,0", `--window-size=${params.windowSize}`, // to disable the 'stability will suffer' infobar "--test-type" ] } }); if (params.interactive) { logger.warn("Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode"); } if (params.user || params.password) { params.automated = true; } if (!params.user && params.automated) { params.user = await promptInput("Enter username: "); } if (!params.password && params.automated) { params.password = await promptInput("Enter password: ", true); } const { page, cdp } = await browser.newWindowPageWithCDP(); const waitUntil = "load"; await page.setCacheEnabled(false); if (!params.automated) { await browser.setupPage({page, cdp}); // for testing, inject browsertrix-behaviors await browser.addInitScript(page, behaviors + ";\nself.__bx_behaviors.init();"); } logger.info(`Loading page: ${params.url}`); await page.goto(params.url, {waitUntil}); if (!params.automated) { const target = await cdp.send("Target.getTargetInfo"); const targetId = target.targetInfo.targetId; new InteractiveBrowser(params, browser, page, cdp, targetId); } else { await automatedProfile(params, browser, page, cdp, waitUntil); } } async function automatedProfile(params, browser, page, cdp, waitUntil) { let u, p; logger.debug("Looking for username and password entry fields on page..."); try { u = await page.waitForSelector("//input[contains(@name, 'user') or contains(@name, 'email')]"); p = await page.waitForSelector("//input[contains(@name, 'pass') and @type='password']"); } catch (e) { if (params.debugScreenshot) { await page.screenshot({path: params.debugScreenshot}); } logger.debug("Login form could not be found"); await page.close(); process.exit(1); return; } await u.type(params.user); await p.type(params.password); await Promise.allSettled([ p.press("Enter"), page.waitForNavigation({waitUntil}) ]); if (params.debugScreenshot) { await page.screenshot({path: params.debugScreenshot}); } await createProfile(params, browser, page, cdp); process.exit(0); } async function createProfile(params, browser, page, cdp, targetFilename = "") { await cdp.send("Network.clearBrowserCache"); await browser.close(); logger.info("Creating profile"); const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz"; const outputDir = path.dirname(profileFilename); if (outputDir && !fs.existsSync(outputDir)) { fs.mkdirSync(outputDir, {recursive: true}); } browser.saveProfile(profileFilename); let resource = {}; const storage = initStorage(); if (storage) { logger.info("Uploading to remote storage..."); resource = await storage.uploadFile(profileFilename, targetFilename); } logger.info("Profile creation done"); return resource; } function promptInput(msg, hidden = false) { const rl = readline.createInterface({ input: process.stdin, output: process.stdout }); if (hidden) { // from https://stackoverflow.com/a/59727173 rl.input.on("keypress", function () { // get the number of characters entered so far: const len = rl.line.length; // move cursor back to the beginning of the input: readline.moveCursor(rl.output, -len, 0); // clear everything to the right of the cursor: readline.clearLine(rl.output, 1); // replace the original input with asterisks: for (let i = 0; i < len; i++) { rl.output.write("*"); } }); } return new Promise((resolve) => { rl.question(msg, function (res) { rl.close(); resolve(res); }); }); } class InteractiveBrowser { constructor(params, browser, page, cdp, targetId) { logger.info("Creating Profile Interactively..."); child_process.spawn("socat", ["tcp-listen:9222,reuseaddr,fork", "tcp:localhost:9221"]); this.params = params; this.browser = browser; this.page = page; this.cdp = cdp; this.targetId = targetId; this.originSet = new Set(); this.addOrigin(); page.on("load", () => this.handlePageLoad()); // attempt to keep everything to initial tab if headless if (this.params.headless) { cdp.send("Page.enable"); cdp.on("Page.windowOpen", async (resp) => { if (resp.url) { await cdp.send("Target.activateTarget", {targetId: this.targetId}); await page.goto(resp.url); } }); } this.shutdownWait = params.shutdownWait * 1000; if (this.shutdownWait) { this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait); logger.debug(`Shutting down in ${this.shutdownWait}ms if no ping received`); } else { this.shutdownTimer = 0; } const httpServer = http.createServer((req, res) => this.handleRequest(req, res)); const port = 9223; httpServer.listen(port); logger.info(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`); if (!params.headless) { logger.info("Screencasting with VNC on port 6080"); } else { logger.info("Screencasting with CDP on port 9222"); } } handlePageLoad() { this.addOrigin(); this.saveCookiesFor(this.page.url()); } async saveAllCookies() { logger.info("Saving all cookies"); for (const origin of this.originSet.values()) { await this.saveCookiesFor(origin + "/"); } } async saveCookiesFor(url) { try { if (this.params.cookieDays <= 0) { return; } const cookies = await this.browser.getCookies(this.page, url); for (const cookie of cookies) { cookie.expires = (new Date().getTime() / 1000) + this.params.cookieDays * 86400; delete cookie.size; delete cookie.session; if (cookie.sameSite && cookie.sameSite !== "Lax" && cookie.sameSite !== "Strict") { delete cookie.sameSite; } if (!cookie.domain && !cookie.path) { cookie.url = url; } } await this.browser.setCookies(this.page, cookies); } catch (e) { logger.error("Save Cookie Error: ", e); } } addOrigin() { const url = this.page.url(); logger.debug("Adding origin", {url}); if (url.startsWith("http:") || url.startsWith("https:")) { this.originSet.add(new URL(url).origin); } } async handleRequest(req, res) { const parsedUrl = new URL(req.url, `http://${req.headers.host}`); const pathname = parsedUrl.pathname; let targetUrl; let origins; switch (pathname) { case "/": res.writeHead(200, {"Content-Type": "text/html"}); if (this.params.headless) { targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`; } else { targetUrl = `http://$HOST:9223/vnc/?host=$HOST&port=6080&password=${process.env.VNC_PASS}`; } res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replaceAll("$HOST", parsedUrl.hostname))); return; case "/vnc/": case "/vnc/index.html": res.writeHead(200, {"Content-Type": "text/html"}); res.end(vncHTML); return; case "/ping": if (this.shutdownWait) { clearInterval(this.shutdownTimer); this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait); logger.debug(`Ping received, delaying shutdown for ${this.shutdownWait}ms`); } origins = Array.from(this.originSet.values()); res.writeHead(200, {"Content-Type": "application/json"}); res.end(JSON.stringify({pong: true, origins})); return; case "/target": res.writeHead(200, {"Content-Type": "application/json"}); res.end(JSON.stringify({targetId: this.targetId})); return; case "/vncpass": res.writeHead(200, {"Content-Type": "application/json"}); res.end(JSON.stringify({password: process.env.VNC_PASS})); return; case "/navigate": if (req.method !== "POST") { break; } try { const postData = await this.readBodyJson(req); const url = new URL(postData.url).href; res.writeHead(200, {"Content-Type": "application/json"}); res.end(JSON.stringify({success: true})); this.page.goto(url); } catch (e) { res.writeHead(400, {"Content-Type": "application/json"}); res.end(JSON.stringify({"error": e.toString()})); logger.warn("HTTP Error", e); } return; case "/createProfileJS": if (req.method !== "POST") { break; } try { const postData = await this.readBodyJson(req); const targetFilename = postData.filename || ""; await this.saveAllCookies(); const resource = await createProfile(this.params, this.browser, this.page, this.cdp, targetFilename); origins = Array.from(this.originSet.values()); res.writeHead(200, {"Content-Type": "application/json"}); res.end(JSON.stringify({resource, origins})); } catch (e) { res.writeHead(500, {"Content-Type": "application/json"}); res.end(JSON.stringify({"error": e.toString()})); logger.warn("HTTP Error", e); } setTimeout(() => process.exit(0), 200); return; case "/createProfile": if (req.method !== "POST") { break; } try { await this.saveAllCookies(); await createProfile(this.params, this.browser, this.page, this.cdp); res.writeHead(200, {"Content-Type": "text/html"}); res.end("Profile Created! You may now close this window."); } catch (e) { res.writeHead(500, {"Content-Type": "text/html"}); res.end("Profile creation failed! See the browsertrix-crawler console for more info"); logger.warn("HTTP Error", e); } setTimeout(() => process.exit(0), 200); return; } if (pathname.startsWith("/vnc/")) { const fileUrl = new URL("node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length), import.meta.url); const file = fs.readFileSync(fileUrl, {encoding: "utf-8"}); res.writeHead(200, {"Content-Type": "application/javascript"}); res.end(file); return; } res.writeHead(404, {"Content-Type": "text/html"}); res.end("Not Found"); } async readBodyJson(req) { const buffers = []; for await (const chunk of req) { buffers.push(chunk); } const data = Buffer.concat(buffers).toString(); if (data.length) { try { return JSON.parse(data) || {}; } catch (e) { return {}; } } } } main();