browsertrix-crawler/create-login-profile.js
Ilya Kreymer 6531d52e2c browser: just pass profileUrl and track if custom profile is used
browser: don't disable service workers always (accidentally added as part of playwright migration)
only disable if using profile, same as 0.8.x behavior
potential fix for #288
bump version to 0.9.1
2023-04-22 12:35:24 -07:00

561 lines
15 KiB
JavaScript
Executable file

#!/usr/bin/env node
import fs from "fs";
import path from "path";
import http from "http";
import readline from "readline";
import child_process from "child_process";
import yargs from "yargs";
import { logger } from "./util/logger.js";
import { sleep } from "./util/timing.js";
import { Browser } from "./util/browser.js";
import { initStorage } from "./util/storage.js";
// Text assets resolved relative to this script and read once at startup (UTF-8):
//  - profileHTML: page served at "/" by the interactive profile UI server
//  - vncHTML:     page served at "/vnc/" and "/vnc/index.html"
//  - behaviors:   browsertrix-behaviors bundle injected into the page as an init script
const [profileHTML, vncHTML, behaviors] = [
  "html/createProfile.html",
  "html/vnc_lite.html",
  "./node_modules/browsertrix-behaviors/dist/behaviors.js",
].map((relativePath) => fs.readFileSync(new URL(relativePath, import.meta.url), {encoding: "utf8"}));
/**
 * Build the yargs option definitions for the profile-creation command.
 *
 * Free-text options (user, password, filename, debugScreenshot) are declared
 * with `type: "string"` so that yargs does not coerce numeric-looking values
 * (e.g. a password of "0123") into numbers before they are typed into the
 * login form or used as file paths.
 *
 * @returns {object} map of option name to yargs option definition
 */
function cliOpts() {
  return {
    "url": {
      describe: "The URL of the login page",
      type: "string",
      demandOption: true,
    },

    "user": {
      describe: "The username for the login. If not specified, will be prompted",
      type: "string",
    },

    "password": {
      describe: "The password for the login. If not specified, will be prompted (recommended)",
      type: "string",
    },

    "filename": {
      describe: "The filename for the profile tarball",
      type: "string",
      default: "/crawls/profiles/profile.tar.gz",
    },

    "debugScreenshot": {
      describe: "If specified, take a screenshot after login and save as this filename",
      type: "string",
    },

    "headless": {
      describe: "Run in headless mode, otherwise start xvfb",
      type: "boolean",
      default: false,
    },

    "automated": {
      describe: "Start in automated mode, no interactive browser",
      type: "boolean",
      default: false,
    },

    "interactive": {
      describe: "Deprecated. Now the default option!",
      type: "boolean",
      default: false
    },

    "shutdownWait": {
      describe: "Shutdown browser in interactive after this many seconds, if no pings received",
      type: "number",
      default: 0
    },

    "profile": {
      describe: "Path to tar.gz file which will be extracted and used as the browser profile",
      type: "string",
    },

    "windowSize": {
      type: "string",
      describe: "Browser window dimensions, specified as: width,height",
      default: getDefaultWindowSize()
    },

    "proxy": {
      describe: "Start a local pywb live proxy and launch the browser through it",
      type: "boolean",
      default: false
    },

    "cookieDays": {
      type: "number",
      describe: "If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
      default: 7
    }
  };
}
/**
 * Derive the default browser window size from the GEOMETRY environment variable.
 *
 * GEOMETRY is split on "x" and its first two fields are used as width and
 * height; any further fields are ignored. If GEOMETRY is unset or does not
 * start with two positive numbers, a fixed fallback size is returned instead
 * of throwing or producing "NaN,NaN".
 *
 * @returns {string} window size formatted as "width,height"
 */
function getDefaultWindowSize() {
  // NOTE(review): fallback size is a reasonable guess — align with the deployment's default GEOMETRY.
  const fallbackSize = "1360,1020";

  const [width, height] = (process.env.GEOMETRY ?? "").split("x").map(Number);
  const isUsable = (value) => Number.isFinite(value) && value > 0;

  return isUsable(width) && isUsable(height) ? `${width},${height}` : fallbackSize;
}
/**
 * Spawn Xvfb and x11vnc so a non-headless browser can be viewed over VNC.
 * Reads DISPLAY, GEOMETRY and VNC_PASS from the environment; x11vnc listens on port 6080.
 */
function startVirtualDisplay() {
  const {DISPLAY, GEOMETRY, VNC_PASS} = process.env;

  logger.debug("Launching XVFB");

  const xvfbArgs = [DISPLAY, "-listen", "tcp", "-screen", "0", GEOMETRY, "-ac", "+extension", "RANDR"];
  child_process.spawn("Xvfb", xvfbArgs);

  const x11vncArgs = [
    "-forever", "-ncache_cr", "-xdamage", "-usepw", "-shared",
    "-rfbport", "6080",
    "-passwd", VNC_PASS,
    "-display", DISPLAY,
  ];
  child_process.spawn("x11vnc", x11vncArgs);
}

/**
 * Spawn the pywb "wayback" live proxy and wait a fixed 3 seconds before returning.
 */
async function startLiveProxy() {
  child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"});
  logger.debug("Running with pywb proxy");
  await sleep(3000);
}

/**
 * Script entry point: parse CLI options, start the optional display/proxy
 * helpers, launch the browser on the (optional) existing profile, load the
 * login URL, then hand over to either the automated login flow or the
 * interactive profile UI.
 */
async function main() {
  const params = yargs(process.argv)
    .usage("browsertrix-crawler profile [options]")
    .option(cliOpts())
    .argv;

  logger.setDebugLogging(true);

  if (!params.headless) {
    startVirtualDisplay();
  }

  const useProxy = Boolean(params.proxy);
  if (useProxy) {
    await startLiveProxy();
  }

  const browser = new Browser();
  const profileDir = await browser.loadProfile(params.profile);

  const chromeOptions = {
    proxy: useProxy,
    extraArgs: [
      "--window-position=0,0",
      `--window-size=${params.windowSize}`,
      // to disable the 'stability will suffer' infobar
      "--test-type"
    ]
  };

  await browser.launch({
    dataDir: profileDir,
    headless: params.headless,
    signals: true,
    chromeOptions,
  });

  if (params.interactive) {
    logger.warn("Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode");
  }

  // Supplying either credential on the command line implies automated mode.
  if (params.user || params.password) {
    params.automated = true;
  }

  // In automated mode, prompt for whichever credential is still missing.
  if (params.automated) {
    if (!params.user) {
      params.user = await promptInput("Enter username: ");
    }
    if (!params.password) {
      params.password = await promptInput("Enter password: ", true);
    }
  }

  const { page, cdp } = await browser.getFirstPageWithCDP();
  const waitUntil = "load";

  await cdp.send("Network.setCacheDisabled", {cacheDisabled: true});

  const isInteractive = !params.automated;

  if (isInteractive) {
    await browser.setupPage({page, cdp});

    // for testing, inject browsertrix-behaviors
    await page.addInitScript(behaviors + ";\nself.__bx_behaviors.init();");
  }

  logger.info(`Loading page: ${params.url}`);
  await page.goto(params.url, {waitUntil});

  if (!isInteractive) {
    await automatedProfile(params, browser, page, cdp, waitUntil);
    return;
  }

  const { targetInfo } = await cdp.send("Target.getTargetInfo");
  // The instance registers its own listeners and HTTP server in the constructor.
  new InteractiveBrowser(params, browser, page, cdp, targetInfo.targetId);
}
/**
 * Non-interactive login: locate the username/password inputs on the loaded
 * page, type the supplied credentials, submit with Enter, then save the
 * profile and exit the process.
 *
 * Exits with code 1 if the login fields cannot be found, and with code 0
 * after the profile has been created.
 *
 * @param {object} params - CLI options; reads user, password and debugScreenshot
 * @param {object} browser - browser wrapper, passed through to createProfile
 * @param {object} page - page already navigated to the login URL
 * @param {object} cdp - CDP session, passed through to createProfile
 * @param {string} waitUntil - load state to wait for after submitting the form
 */
async function automatedProfile(params, browser, page, cdp, waitUntil) {
  const userFieldXPath = "//input[contains(@name, 'user') or contains(@name, 'email')]";
  const passFieldXPath = "//input[contains(@name, 'pass') and @type='password']";

  let userField;
  let passField;

  logger.debug("Looking for username and password entry fields on page...");

  try {
    userField = await page.waitForSelector(userFieldXPath);
    passField = await page.waitForSelector(passFieldXPath);
  } catch (e) {
    // Capture what the page looked like before giving up, if requested.
    if (params.debugScreenshot) {
      await page.screenshot({path: params.debugScreenshot});
    }
    logger.debug("Login form could not be found");
    await page.close();
    process.exit(1);
    return;
  }

  await userField.type(params.user);
  await passField.type(params.password);

  // Submit and wait for the resulting navigation; failures of either are tolerated.
  const submitted = passField.press("Enter");
  const navigated = page.waitForNavigation({waitUntil});
  await Promise.allSettled([submitted, navigated]);

  if (params.debugScreenshot) {
    await page.screenshot({path: params.debugScreenshot});
  }

  await createProfile(params, browser, page, cdp);

  process.exit(0);
}
/**
 * Close the browser, write its profile to a tarball and, when remote storage
 * is configured, upload the tarball.
 *
 * @param {object} params - CLI options; reads filename (defaults to /crawls/profiles/profile.tar.gz)
 * @param {object} browser - browser wrapper providing close() and saveProfile()
 * @param {object} page - unused here; kept for signature compatibility
 * @param {object} cdp - CDP session used to clear the browser cache first
 * @param {string} [targetFilename=""] - name passed to the storage uploader
 * @returns {Promise<object>} the upload result, or an empty object when no storage is configured
 */
async function createProfile(params, browser, page, cdp, targetFilename = "") {
  // Drop cached responses, then shut the browser down before saving the profile.
  await cdp.send("Network.clearBrowserCache");
  await browser.close();

  logger.info("Creating profile");

  const tarballPath = params.filename ? params.filename : "/crawls/profiles/profile.tar.gz";

  // Make sure the destination directory exists before writing the tarball.
  const parentDir = path.dirname(tarballPath);
  const parentMissing = Boolean(parentDir) && !fs.existsSync(parentDir);
  if (parentMissing) {
    fs.mkdirSync(parentDir, {recursive: true});
  }

  // Called without await, as in the original flow.
  browser.saveProfile(tarballPath);

  const remoteStorage = initStorage();
  if (!remoteStorage) {
    logger.info("Profile creation done");
    return {};
  }

  logger.info("Uploading to remote storage...");
  const uploaded = await remoteStorage.uploadFile(tarballPath, targetFilename);

  logger.info("Profile creation done");
  return uploaded;
}
/**
 * Ask a question on the terminal and resolve with the line the user enters.
 *
 * When `hidden` is true, every keypress re-renders the typed text as
 * asterisks so the value is not shown on screen. The keypress listener is
 * removed again once the answer has been received, so it does not stay
 * attached to process.stdin after the prompt is finished.
 *
 * @param {string} msg - prompt text written before reading input
 * @param {boolean} [hidden=false] - mask typed characters with "*"
 * @returns {Promise<string>} the entered line
 */
function promptInput(msg, hidden = false) {
  const input = process.stdin;
  const output = process.stdout;

  const rl = readline.createInterface({input, output});

  let maskTypedInput = null;

  if (hidden) {
    // from https://stackoverflow.com/a/59727173
    maskTypedInput = () => {
      // get the number of characters entered so far:
      const len = rl.line.length;
      // move cursor back to the beginning of the input:
      readline.moveCursor(output, -len, 0);
      // clear everything to the right of the cursor:
      readline.clearLine(output, 1);
      // replace the original input with asterisks:
      if (len > 0) {
        output.write("*".repeat(len));
      }
    };
    input.on("keypress", maskTypedInput);
  }

  return new Promise((resolve) => {
    rl.question(msg, (answer) => {
      // Detach the masking listener so it does not outlive this prompt.
      if (maskTypedInput) {
        input.off("keypress", maskTypedInput);
      }
      rl.close();
      resolve(answer);
    });
  });
}
/**
 * Drives interactive profile creation: tracks the origins the user visits,
 * extends cookie lifetimes, and serves a small HTTP UI/API on port 9223
 * through which the browser is viewed and the profile is finally created.
 */
class InteractiveBrowser {
  /**
   * Wires up page/CDP listeners, the optional inactivity shutdown timer and
   * the HTTP server. All work happens as a side effect of construction.
   *
   * @param {object} params - CLI options; reads shutdownWait, headless, cookieDays
   * @param {object} browser - browser wrapper (provides context and is passed to createProfile)
   * @param {object} page - the page the user interacts with
   * @param {object} cdp - CDP session for that page
   * @param {string} targetId - CDP target id of the page
   */
  constructor(params, browser, page, cdp, targetId) {
    logger.info("Creating Profile Interactively...");

    // Relay TCP connections on port 9222 to localhost:9221.
    // NOTE(review): presumably 9221 is the browser's remote-debugging port — confirm against the browser launch options.
    child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);

    this.params = params;
    this.browser = browser;
    this.page = page;
    this.cdp = cdp;
    this.targetId = targetId;

    this.originSet = new Set();
    this.addOrigin();

    page.on("load", () => this.handlePageLoad());

    // Keep the original target in front when a popup opens.
    page.on("popup", async () => {
      try {
        await cdp.send("Target.activateTarget", {targetId: this.targetId});
      } catch (e) {
        // An event callback has no caller to report to; log instead of rejecting unhandled.
        logger.warn("Error re-activating target after popup", e);
      }
    });

    // Load URLs that would open in a new window in the current page instead.
    cdp.on("Page.windowOpen", async (resp) => {
      if (!resp.url) {
        return;
      }
      try {
        await page.goto(resp.url);
      } catch (e) {
        logger.warn("Error loading window.open() URL", e);
      }
    });

    this.shutdownWait = params.shutdownWait * 1000;

    if (this.shutdownWait) {
      this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
      logger.debug(`Shutting down in ${this.shutdownWait}ms if no ping received`);
    } else {
      this.shutdownTimer = 0;
    }

    const httpServer = http.createServer((req, res) => {
      // handleRequest is async: make sure an unexpected failure produces a
      // response and a log entry rather than an unhandled rejection.
      this.handleRequest(req, res).catch((e) => {
        logger.warn("HTTP Error", e);
        if (!res.headersSent) {
          res.writeHead(500, {"Content-Type": "text/html"});
        }
        if (!res.writableEnded) {
          res.end("Internal Server Error");
        }
      });
    });
    const port = 9223;
    httpServer.listen(port);
    logger.info(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);

    if (!params.headless) {
      logger.info("Screencasting with VNC on port 6080");
    } else {
      logger.info("Screencasting with CDP on port 9222");
    }
  }

  /**
   * On every page load: remember the page's origin and refresh its cookies.
   */
  handlePageLoad() {
    this.addOrigin();
    // Not awaited: saveCookiesFor catches and logs its own errors.
    this.saveCookiesFor(this.page.url());
  }

  /**
   * Refresh cookie expiry for every origin recorded so far.
   */
  async saveAllCookies() {
    logger.info("Saving all cookies");

    for (const origin of this.originSet.values()) {
      await this.saveCookiesFor(origin + "/");
    }
  }

  /**
   * Re-add all cookies visible for `url` with an expiry of `cookieDays` days
   * from now, so session cookies survive in the saved profile. Does nothing
   * when cookieDays is <= 0. Errors are logged, not thrown.
   *
   * @param {string} url - URL whose cookies should be extended
   */
  async saveCookiesFor(url) {
    try {
      if (this.params.cookieDays <= 0) {
        return;
      }

      const cookies = await this.browser.context.cookies(url);
      for (const cookie of cookies) {
        cookie.expires = (new Date().getTime() / 1000) + this.params.cookieDays * 86400;

        // Strip fields that are not re-submitted with addCookies.
        delete cookie.size;
        delete cookie.session;
        if (cookie.sameSite && cookie.sameSite !== "Lax" && cookie.sameSite !== "Strict") {
          delete cookie.sameSite;
        }
        if (!cookie.domain && !cookie.path) {
          cookie.url = url;
        }
      }
      await this.browser.context.addCookies(cookies);
    } catch (e) {
      logger.error("Save Cookie Error: ", e);
    }
  }

  /**
   * Record the origin of the page's current URL, if it is an http(s) URL.
   */
  addOrigin() {
    const url = this.page.url();
    logger.debug("Adding origin", {url});
    if (url.startsWith("http:") || url.startsWith("https:")) {
      this.originSet.add(new URL(url).origin);
    }
  }

  /**
   * Route one HTTP request of the profile UI server.
   *
   * Routes: "/" (UI page), "/vnc/" and "/vnc/index.html" (VNC viewer page),
   * "/ping" (keep-alive, returns visited origins), "/target", "/vncpass",
   * POST "/navigate", POST "/createProfileJS", POST "/createProfile", and
   * "/vnc/<file>" (files from the noVNC package). Anything else, including
   * unreadable noVNC files, yields a 404.
   *
   * @param {object} req - incoming HTTP request
   * @param {object} res - HTTP response to write to
   */
  async handleRequest(req, res) {
    const parsedUrl = new URL(req.url, `http://${req.headers.host}`);
    const pathname = parsedUrl.pathname;
    let origins;

    switch (pathname) {
    case "/": {
      const host = parsedUrl.hostname;
      const targetUrl = this.params.headless
        ? `http://${host}:9222/devtools/inspector.html?ws=${host}:9222/devtools/page/${this.targetId}&panel=resources`
        : `http://${host}:9223/vnc/?host=${host}&port=6080&password=${process.env.VNC_PASS}`;

      res.writeHead(200, {"Content-Type": "text/html"});
      // A replacer function inserts the URL literally; a plain string would
      // have "$&", "$1", "$$" etc. (possible in the VNC password) interpreted.
      res.end(profileHTML.replace("$DEVTOOLS_SRC", () => targetUrl));
      return;
    }

    case "/vnc/":
    case "/vnc/index.html":
      res.writeHead(200, {"Content-Type": "text/html"});
      res.end(vncHTML);
      return;

    case "/ping":
      if (this.shutdownWait) {
        // Restart the inactivity countdown.
        clearTimeout(this.shutdownTimer);
        this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
        logger.debug(`Ping received, delaying shutdown for ${this.shutdownWait}ms`);
      }

      origins = Array.from(this.originSet.values());

      res.writeHead(200, {"Content-Type": "application/json"});
      res.end(JSON.stringify({pong: true, origins}));
      return;

    case "/target":
      res.writeHead(200, {"Content-Type": "application/json"});
      res.end(JSON.stringify({targetId: this.targetId}));
      return;

    case "/vncpass":
      res.writeHead(200, {"Content-Type": "application/json"});
      res.end(JSON.stringify({password: process.env.VNC_PASS}));
      return;

    case "/navigate":
      if (req.method !== "POST") {
        break;
      }

      try {
        const postData = await this.readBodyJson(req);
        const url = new URL(postData.url).href;

        res.writeHead(200, {"Content-Type": "application/json"});
        res.end(JSON.stringify({success: true}));

        // The response is already sent; navigation runs in the background,
        // so its failure is logged here instead of rejecting unhandled.
        this.page.goto(url).catch((e) => logger.warn("Navigation Error", e));
      } catch (e) {
        res.writeHead(400, {"Content-Type": "application/json"});
        res.end(JSON.stringify({"error": e.toString()}));
        logger.warn("HTTP Error", e);
      }
      return;

    case "/createProfileJS":
      if (req.method !== "POST") {
        break;
      }

      try {
        const postData = await this.readBodyJson(req);
        const targetFilename = postData.filename || "";

        await this.saveAllCookies();

        const resource = await createProfile(this.params, this.browser, this.page, this.cdp, targetFilename);
        origins = Array.from(this.originSet.values());

        res.writeHead(200, {"Content-Type": "application/json"});
        res.end(JSON.stringify({resource, origins}));
      } catch (e) {
        res.writeHead(500, {"Content-Type": "application/json"});
        res.end(JSON.stringify({"error": e.toString()}));
        logger.warn("HTTP Error", e);
      }

      // Give the response a moment to flush, then exit.
      setTimeout(() => process.exit(0), 200);
      return;

    case "/createProfile":
      if (req.method !== "POST") {
        break;
      }

      try {
        await this.saveAllCookies();

        await createProfile(this.params, this.browser, this.page, this.cdp);

        res.writeHead(200, {"Content-Type": "text/html"});
        res.end("<html><body>Profile Created! You may now close this window.</body></html>");
      } catch (e) {
        res.writeHead(500, {"Content-Type": "text/html"});
        res.end("<html><body>Profile creation failed! See the browsertrix-crawler console for more info");
        logger.warn("HTTP Error", e);
      }

      // Give the response a moment to flush, then exit.
      setTimeout(() => process.exit(0), 200);
      return;
    }

    if (pathname.startsWith("/vnc/")) {
      const novncRoot = new URL("node_modules/@novnc/novnc/", import.meta.url);
      const fileUrl = new URL(`node_modules/@novnc/novnc/${pathname.slice("/vnc/".length)}`, import.meta.url);

      let file = null;
      // Only serve files that resolve inside the noVNC package directory.
      if (fileUrl.href.startsWith(novncRoot.href)) {
        try {
          file = fs.readFileSync(fileUrl, {encoding: "utf-8"});
        } catch (e) {
          // Missing or unreadable file: fall through to the 404 below.
          logger.debug("Static file could not be read", {pathname, error: e.toString()});
        }
      }

      if (file !== null) {
        res.writeHead(200, {"Content-Type": "application/javascript"});
        res.end(file);
        return;
      }
    }

    res.writeHead(404, {"Content-Type": "text/html"});
    res.end("Not Found");
  }

  /**
   * Read the full request body and parse it as JSON.
   *
   * @param {object} req - request (any async or sync iterable of Buffer chunks)
   * @returns {Promise<object>} the parsed value; an empty object when the body
   *   is empty, is not valid JSON, or parses to a falsy value
   */
  async readBodyJson(req) {
    const buffers = [];

    for await (const chunk of req) {
      buffers.push(chunk);
    }

    const data = Buffer.concat(buffers).toString();

    // An empty body is treated like an empty JSON object so callers can
    // always read properties off the result.
    if (!data.length) {
      return {};
    }

    try {
      return JSON.parse(data) || {};
    } catch (e) {
      // Deliberately lenient: malformed JSON is treated as "no data".
      return {};
    }
  }
}
// Script entry point. main() is async, so attach a rejection handler: a
// startup failure is logged and the process exits non-zero instead of the
// promise being left floating.
main().catch((e) => {
  logger.error("Profile creation failed", e);
  process.exit(1);
});