mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00

* browser: just pass profileUrl and track if a custom profile is used.
  browser: don't always disable service workers (accidentally added as part of the playwright migration);
  only disable them when using a profile, same as the 0.8.x behavior.
  Fix for #288.
* Fix full page screenshot (#296)
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
561 lines
15 KiB
JavaScript
Executable file
561 lines
15 KiB
JavaScript
Executable file
#!/usr/bin/env node
|
|
|
|
import fs from "fs";
|
|
import path from "path";
|
|
import http from "http";
|
|
|
|
import readline from "readline";
|
|
import child_process from "child_process";
|
|
|
|
import yargs from "yargs";
|
|
|
|
import { logger } from "./util/logger.js";
|
|
|
|
import { sleep } from "./util/timing.js";
|
|
import { Browser } from "./util/browser.js";
|
|
import { initStorage } from "./util/storage.js";
|
|
|
|
// Static text assets, read synchronously once at module load.
// Paths are resolved relative to this module's own URL, not the working directory.
const readTextAsset = (relativePath) => fs.readFileSync(new URL(relativePath, import.meta.url), {encoding: "utf8"});

// Page served at "/" by the interactive UI server ($DEVTOOLS_SRC is substituted per request).
const profileHTML = readTextAsset("html/createProfile.html");

// Page served at "/vnc/" and "/vnc/index.html".
const vncHTML = readTextAsset("html/vnc_lite.html");

// browsertrix-behaviors bundle, injected as an init script in interactive mode.
const behaviors = readTextAsset("./node_modules/browsertrix-behaviors/dist/behaviors.js");
|
|
|
|
/**
 * Build the yargs option definitions for the profile-creation command.
 *
 * The default for `windowSize` is computed by getDefaultWindowSize() each time
 * this function is called.
 *
 * @returns {Object} map of option name to yargs option spec, in display order
 */
function cliOpts() {
  // Login target and credentials
  const loginOpts = {
    url: {
      type: "string",
      demandOption: true,
      describe: "The URL of the login page",
    },
    user: {
      describe: "The username for the login. If not specified, will be prompted",
    },
    password: {
      describe: "The password for the login. If not specified, will be prompted (recommended)",
    },
  };

  // Where results are written
  const outputOpts = {
    filename: {
      default: "/crawls/profiles/profile.tar.gz",
      describe: "The filename for the profile tarball",
    },
    debugScreenshot: {
      describe: "If specified, take a screenshot after login and save as this filename",
    },
  };

  // How the session is driven
  const modeOpts = {
    headless: {
      type: "boolean",
      default: false,
      describe: "Run in headless mode, otherwise start xvfb",
    },
    automated: {
      type: "boolean",
      default: false,
      describe: "Start in automated mode, no interactive browser",
    },
    interactive: {
      type: "boolean",
      default: false,
      describe: "Deprecated. Now the default option!",
    },
    shutdownWait: {
      type: "number",
      default: 0,
      describe: "Shutdown browser in interactive after this many seconds, if no pings received",
    },
  };

  // Browser setup
  const browserOpts = {
    profile: {
      type: "string",
      describe: "Path to tar.gz file which will be extracted and used as the browser profile",
    },
    windowSize: {
      type: "string",
      default: getDefaultWindowSize(),
      describe: "Browser window dimensions, specified as: width,height",
    },
    proxy: {
      type: "boolean",
      default: false,
    },
    cookieDays: {
      type: "number",
      default: 7,
      describe: "If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
    },
  };

  // Spread order keeps the option order: login, output, mode, browser.
  return {...loginOpts, ...outputOpts, ...modeOpts, ...browserOpts};
}
|
|
|
|
/**
 * Compute the default `--windowSize` value from the GEOMETRY environment variable.
 *
 * GEOMETRY is read as an "x"-separated string; only the first two fields are
 * used, as width and height. Any further fields are ignored.
 *
 * If GEOMETRY is unset or does not start with two positive numbers, a built-in
 * fallback is used. This function runs while the CLI options are being built,
 * so it must not throw just because the variable is missing.
 *
 * @returns {string} window size formatted as "width,height"
 */
function getDefaultWindowSize() {
  // NOTE(review): fallback chosen as a reasonable desktop size; confirm it
  // matches the GEOMETRY the deployment image normally exports.
  const FALLBACK_GEOMETRY = "1360x1020x16";

  const toWindowSize = (geometry) => {
    const [width, height] = geometry.split("x").map(Number);
    const isValid = Number.isFinite(width) && Number.isFinite(height) && width > 0 && height > 0;
    return isValid ? `${width},${height}` : null;
  };

  return toWindowSize(process.env.GEOMETRY ?? "") ?? toWindowSize(FALLBACK_GEOMETRY);
}
|
|
|
|
|
|
|
|
/**
 * Entry point for profile creation.
 *
 * Steps, in order:
 *  1. parse CLI options and enable debug logging
 *  2. unless headless, spawn Xvfb and x11vnc (VNC on port 6080)
 *  3. if --proxy is set, spawn `wayback` and wait 3 seconds
 *  4. load the optional existing profile and launch the browser
 *  5. decide between automated and interactive mode, prompting for any
 *     missing credentials in automated mode
 *  6. load the login URL, then hand off to automatedProfile() or create an
 *     InteractiveBrowser
 */
async function main() {
  const params = yargs(process.argv)
    .usage("browsertrix-crawler profile [options]")
    .option(cliOpts())
    .argv;

  logger.setDebugLogging(true);

  if (!params.headless) {
    logger.debug("Launching XVFB");

    const xvfbArgs = [
      process.env.DISPLAY,
      "-listen", "tcp",
      "-screen", "0", process.env.GEOMETRY,
      "-ac",
      "+extension", "RANDR",
    ];
    child_process.spawn("Xvfb", xvfbArgs);

    const vncArgs = [
      "-forever",
      "-ncache_cr",
      "-xdamage",
      "-usepw",
      "-shared",
      "-rfbport", "6080",
      "-passwd", process.env.VNC_PASS,
      "-display", process.env.DISPLAY,
    ];
    child_process.spawn("x11vnc", vncArgs);
  }

  const useProxy = Boolean(params.proxy);

  if (useProxy) {
    child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"});

    logger.debug("Running with pywb proxy");

    // fixed delay after spawning the proxy, presumably to let it start up
    await sleep(3000);
  }

  const browser = new Browser();
  const dataDir = await browser.loadProfile(params.profile);

  const extraArgs = [
    "--window-position=0,0",
    `--window-size=${params.windowSize}`,
    // to disable the 'stability will suffer' infobar
    "--test-type",
  ];

  await browser.launch({
    dataDir,
    headless: params.headless,
    signals: true,
    chromeOptions: {proxy: useProxy, extraArgs},
  });

  if (params.interactive) {
    logger.warn("Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode");
  }

  // Supplying either credential implies automated mode.
  if (params.user || params.password) {
    params.automated = true;
  }

  if (params.automated) {
    if (!params.user) {
      params.user = await promptInput("Enter username: ");
    }
    if (!params.password) {
      params.password = await promptInput("Enter password: ", true);
    }
  }

  const {page, cdp} = await browser.getFirstPageWithCDP();
  const waitUntil = "load";

  await cdp.send("Network.setCacheDisabled", {cacheDisabled: true});

  const isInteractive = !params.automated;

  if (isInteractive) {
    await browser.setupPage({page, cdp});

    // for testing, inject browsertrix-behaviors
    await page.addInitScript(`${behaviors};\nself.__bx_behaviors.init();`);
  }

  logger.info(`Loading page: ${params.url}`);

  await page.goto(params.url, {waitUntil});

  if (!isInteractive) {
    await automatedProfile(params, browser, page, cdp, waitUntil);
    return;
  }

  const {targetInfo} = await cdp.send("Target.getTargetInfo");

  // The instance is not stored here: its constructor registers the page
  // listeners and starts the UI HTTP server.
  new InteractiveBrowser(params, browser, page, cdp, targetInfo.targetId);
}
|
|
|
|
/**
 * Log in non-interactively and save the resulting profile.
 *
 * Waits for a username/email input and a password input, types the
 * credentials from `params`, presses Enter and waits for navigation, then
 * calls createProfile(). The process always exits from here: with code 1 if
 * the login fields are not found, otherwise with code 0.
 *
 * @param {Object} params - parsed CLI options (user, password, debugScreenshot, ...)
 * @param {Object} browser - browser wrapper, passed through to createProfile()
 * @param {Object} page - page that already has the login URL loaded
 * @param {Object} cdp - CDP session, passed through to createProfile()
 * @param {string} waitUntil - load state to wait for after submitting the form
 */
async function automatedProfile(params, browser, page, cdp, waitUntil) {
  const USER_FIELD = "//input[contains(@name, 'user') or contains(@name, 'email')]";
  const PASSWORD_FIELD = "//input[contains(@name, 'pass') and @type='password']";

  // Only captures a screenshot when --debugScreenshot was given.
  const takeDebugScreenshot = async () => {
    if (params.debugScreenshot) {
      await page.screenshot({path: params.debugScreenshot});
    }
  };

  logger.debug("Looking for username and password entry fields on page...");

  let userInput;
  let passwordInput;

  try {
    userInput = await page.waitForSelector(USER_FIELD);
    passwordInput = await page.waitForSelector(PASSWORD_FIELD);
  } catch (e) {
    await takeDebugScreenshot();
    logger.debug("Login form could not be found");
    await page.close();
    process.exit(1);
    return;
  }

  await userInput.type(params.user);
  await passwordInput.type(params.password);

  // allSettled: continue even if the key press or the navigation wait rejects
  const submit = passwordInput.press("Enter");
  const navigation = page.waitForNavigation({waitUntil});
  await Promise.allSettled([submit, navigation]);

  await takeDebugScreenshot();

  await createProfile(params, browser, page, cdp);

  process.exit(0);
}
|
|
|
|
/**
 * Close the browser and save its profile, optionally uploading it.
 *
 * Clears the browser cache over CDP, closes the browser, creates the output
 * directory if it does not exist, saves the profile to `params.filename`
 * (or the default path), and uploads it if initStorage() returns a storage
 * object.
 *
 * @param {Object} params - parsed CLI options; `filename` is the output path
 * @param {Object} browser - browser wrapper providing close() and saveProfile()
 * @param {Object} page - unused here; kept for the existing call signature
 * @param {Object} cdp - CDP session used to clear the browser cache
 * @param {string} [targetFilename=""] - name passed to the storage upload
 * @returns {Promise<Object>} result of the upload, or {} when no storage is configured
 */
async function createProfile(params, browser, page, cdp, targetFilename = "") {
  await cdp.send("Network.clearBrowserCache");
  await browser.close();

  logger.info("Creating profile");

  const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz";

  const outputDir = path.dirname(profileFilename);
  const needsOutputDir = outputDir && !fs.existsSync(outputDir);
  if (needsOutputDir) {
    fs.mkdirSync(outputDir, {recursive: true});
  }

  browser.saveProfile(profileFilename);

  // Upload only when remote storage is configured; otherwise return {}.
  const uploadIfConfigured = async () => {
    const storage = initStorage();
    if (!storage) {
      return {};
    }
    logger.info("Uploading to remote storage...");
    return await storage.uploadFile(profileFilename, targetFilename);
  };

  const resource = await uploadIfConfigured();

  logger.info("Profile creation done");
  return resource;
}
|
|
|
|
/**
 * Ask a question on the terminal and resolve with the line the user enters.
 *
 * @param {string} msg - prompt text
 * @param {boolean} [hidden=false] - when true, typed characters are redrawn as asterisks
 * @returns {Promise<string>} the entered line
 */
function promptInput(msg, hidden = false) {
  const rl = readline.createInterface({input: process.stdin, output: process.stdout});

  if (hidden) {
    // from https://stackoverflow.com/a/59727173
    const maskTypedInput = () => {
      // number of characters entered so far
      const typed = rl.line.length;

      // jump back to the start of the input and wipe everything after the cursor
      readline.moveCursor(rl.output, -typed, 0);
      readline.clearLine(rl.output, 1);

      // redraw the input as one asterisk per character
      let remaining = typed;
      while (remaining > 0) {
        rl.output.write("*");
        remaining -= 1;
      }
    };

    rl.input.on("keypress", maskTypedInput);
  }

  return new Promise((resolve) => {
    rl.question(msg, (answer) => {
      rl.close();
      resolve(answer);
    });
  });
}
|
|
|
|
|
|
/**
 * Drives interactive profile creation.
 *
 * Constructing an instance:
 *  - spawns `socat` to forward TCP port 9222 to localhost:9221
 *    (presumably the browser's remote-debugging port; verify against the Browser helper),
 *  - tracks the origins of every page loaded in the browser,
 *  - starts an HTTP server on port 9223 that serves the control UI and the
 *    endpoints handled by handleRequest(),
 *  - optionally arms a shutdown timer that is re-armed on every `/ping`.
 */
class InteractiveBrowser {
  /**
   * @param {Object} params - parsed CLI options (headless, shutdownWait, cookieDays, ...)
   * @param {Object} browser - browser wrapper (provides `context`, close(), saveProfile())
   * @param {Object} page - the page the user interacts with
   * @param {Object} cdp - CDP session for that page
   * @param {string} targetId - CDP target id of the page
   */
  constructor(params, browser, page, cdp, targetId) {
    logger.info("Creating Profile Interactively...");
    child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);

    this.params = params;
    this.browser = browser;
    this.page = page;
    this.cdp = cdp;

    this.targetId = targetId;

    this.originSet = new Set();

    this.addOrigin();

    page.on("load", () => this.handlePageLoad());

    // keep the main target focused when a popup opens
    page.on("popup", async () => {
      await cdp.send("Target.activateTarget", {targetId: this.targetId});
    });

    // load would-be new windows in the main page instead
    cdp.on("Page.windowOpen", async (resp) => {
      if (!resp.url) {
        return;
      }
      try {
        await page.goto(resp.url);
      } catch (e) {
        // a failed navigation must not surface as an unhandled rejection
        logger.warn("Navigation Error", e);
      }
    });

    this.shutdownWait = params.shutdownWait * 1000;

    if (this.shutdownWait) {
      this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
      logger.debug(`Shutting down in ${this.shutdownWait}ms if no ping received`);
    } else {
      this.shutdownTimer = 0;
    }

    const httpServer = http.createServer((req, res) => {
      // handleRequest is async: catch anything it throws so a bad request
      // cannot become an unhandled rejection or leave the client hanging.
      this.handleRequest(req, res).catch((e) => {
        logger.warn("HTTP Error", e);
        if (res.writableEnded) {
          return;
        }
        if (!res.headersSent) {
          res.writeHead(500, {"Content-Type": "text/html"});
        }
        res.end("Internal Server Error");
      });
    });
    const port = 9223;
    httpServer.listen(port);
    logger.info(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);

    if (!params.headless) {
      logger.info("Screencasting with VNC on port 6080");
    } else {
      logger.info("Screencasting with CDP on port 9222");
    }
  }

  /** Record the loaded page's origin and extend its cookies. */
  handlePageLoad() {
    this.addOrigin();
    // saveCookiesFor catches and logs its own errors, so it is safe to leave un-awaited
    this.saveCookiesFor(this.page.url());
  }

  /** Extend cookie lifetimes for every origin seen so far. */
  async saveAllCookies() {
    logger.info("Saving all cookies");

    for (const origin of this.originSet.values()) {
      await this.saveCookiesFor(origin + "/");
    }
  }

  /**
   * Re-add every cookie visible for `url` with an expiry `cookieDays` days from
   * now, so that session cookies persist in the saved profile.
   * Does nothing when `cookieDays` is <= 0. Errors are logged, never thrown.
   *
   * @param {string} url - URL whose cookies should be extended
   */
  async saveCookiesFor(url) {
    try {
      if (this.params.cookieDays <= 0) {
        return;
      }

      const cookies = await this.browser.context.cookies(url);
      for (const cookie of cookies) {
        cookie.expires = (new Date().getTime() / 1000) + this.params.cookieDays * 86400;
        delete cookie.size;
        delete cookie.session;
        // drop any sameSite value other than "Lax" / "Strict" before re-adding
        if (cookie.sameSite && cookie.sameSite !== "Lax" && cookie.sameSite !== "Strict") {
          delete cookie.sameSite;
        }
        if (!cookie.domain && !cookie.path) {
          cookie.url = url;
        }
      }
      await this.browser.context.addCookies(cookies);
    } catch (e) {
      logger.error("Save Cookie Error: ", e);
    }
  }

  /** Add the current page's origin to `originSet` (http/https URLs only). */
  addOrigin() {
    const url = this.page.url();
    logger.debug("Adding origin", {url});
    if (url.startsWith("http:") || url.startsWith("https:")) {
      this.originSet.add(new URL(url).origin);
    }
  }

  /**
   * Route one HTTP request of the control UI.
   *
   * Routes:
   *  - `/`                      UI page with the devtools/VNC frame URL filled in
   *  - `/vnc/`, `/vnc/index.html`  VNC client page
   *  - `/ping`                  re-arm the shutdown timer, return seen origins
   *  - `/target`                CDP target id
   *  - `/vncpass`               VNC password
   *  - `/navigate` (POST)       navigate the page to `url` from the JSON body
   *  - `/createProfileJS` (POST) save the profile, JSON response, then exit
   *  - `/createProfile` (POST)  save the profile, HTML response, then exit
   *  - `/vnc/*`                 files from the noVNC package
   *  - anything else, or a non-POST request to a POST route: 404
   *
   * @param {http.IncomingMessage} req
   * @param {http.ServerResponse} res
   */
  async handleRequest(req, res) {
    const parsedUrl = new URL(req.url, `http://${req.headers.host}`);
    const pathname = parsedUrl.pathname;
    let targetUrl;
    let origins;

    switch (pathname) {
      case "/":
        res.writeHead(200, {"Content-Type": "text/html"});
        if (this.params.headless) {
          targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`;
        } else {
          targetUrl = `http://$HOST:9223/vnc/?host=$HOST&port=6080&password=${process.env.VNC_PASS}`;
        }
        res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replaceAll("$HOST", parsedUrl.hostname)));
        return;

      case "/vnc/":
      case "/vnc/index.html":
        res.writeHead(200, {"Content-Type": "text/html"});
        res.end(vncHTML);
        return;

      case "/ping":
        if (this.shutdownWait) {
          // the timer is created with setTimeout, so clear it with clearTimeout
          clearTimeout(this.shutdownTimer);
          this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
          logger.debug(`Ping received, delaying shutdown for ${this.shutdownWait}ms`);
        }

        origins = Array.from(this.originSet.values());

        res.writeHead(200, {"Content-Type": "application/json"});
        res.end(JSON.stringify({pong: true, origins}));
        return;

      case "/target":
        res.writeHead(200, {"Content-Type": "application/json"});
        res.end(JSON.stringify({targetId: this.targetId}));
        return;

      case "/vncpass":
        res.writeHead(200, {"Content-Type": "application/json"});
        res.end(JSON.stringify({password: process.env.VNC_PASS}));
        return;

      case "/navigate":
        if (req.method !== "POST") {
          break;
        }

        try {
          const postData = await this.readBodyJson(req);
          // throws (-> 400) when the url is missing or not absolute
          const url = new URL(postData.url).href;

          res.writeHead(200, {"Content-Type": "application/json"});
          res.end(JSON.stringify({success: true}));

          // Fire-and-forget: the response is already sent. Navigation failures
          // are logged rather than left as unhandled rejections.
          this.page.goto(url).catch((e) => logger.warn("Navigation Error", e));
        } catch (e) {
          res.writeHead(400, {"Content-Type": "application/json"});
          res.end(JSON.stringify({"error": e.toString()}));
          logger.warn("HTTP Error", e);
        }
        return;

      case "/createProfileJS":
        if (req.method !== "POST") {
          break;
        }

        try {
          const postData = await this.readBodyJson(req);
          const targetFilename = postData.filename || "";

          await this.saveAllCookies();

          const resource = await createProfile(this.params, this.browser, this.page, this.cdp, targetFilename);
          origins = Array.from(this.originSet.values());

          res.writeHead(200, {"Content-Type": "application/json"});
          res.end(JSON.stringify({resource, origins}));
        } catch (e) {
          res.writeHead(500, {"Content-Type": "application/json"});
          res.end(JSON.stringify({"error": e.toString()}));
          logger.warn("HTTP Error", e);
        }

        // short delay before exiting, presumably so the response can flush
        setTimeout(() => process.exit(0), 200);
        return;

      case "/createProfile":
        if (req.method !== "POST") {
          break;
        }

        try {
          await this.saveAllCookies();

          await createProfile(this.params, this.browser, this.page, this.cdp);

          res.writeHead(200, {"Content-Type": "text/html"});
          res.end("<html><body>Profile Created! You may now close this window.</body></html>");
        } catch (e) {
          res.writeHead(500, {"Content-Type": "text/html"});
          res.end("<html><body>Profile creation failed! See the browsertrix-crawler console for more info");
          logger.warn("HTTP Error", e);
        }

        setTimeout(() => process.exit(0), 200);
        return;
    }

    if (pathname.startsWith("/vnc/")) {
      const novncRoot = new URL("node_modules/@novnc/novnc/", import.meta.url);
      const fileUrl = new URL("node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length), import.meta.url);

      let file = null;
      // only serve files that resolve inside the noVNC package directory
      if (fileUrl.href.startsWith(novncRoot.href)) {
        try {
          file = fs.readFileSync(fileUrl, {encoding: "utf-8"});
        } catch (e) {
          // missing or unreadable file: fall through to the 404 below
          file = null;
        }
      }

      if (file !== null) {
        res.writeHead(200, {"Content-Type": "application/javascript"});
        res.end(file);
        return;
      }
    }

    res.writeHead(404, {"Content-Type": "text/html"});
    res.end("Not Found");
  }

  /**
   * Read the whole request body and parse it as JSON.
   *
   * @param {AsyncIterable<Buffer>} req - request stream
   * @returns {Promise<Object>} the parsed value, or {} when the body is empty,
   *   is not valid JSON, or parses to a falsy value
   */
  async readBodyJson(req) {
    const buffers = [];

    for await (const chunk of req) {
      buffers.push(chunk);
    }

    const data = Buffer.concat(buffers).toString();

    // An empty body yields {} so callers can read properties without a null check.
    if (!data.length) {
      return {};
    }

    try {
      return JSON.parse(data) || {};
    } catch (e) {
      return {};
    }
  }
}
|
|
|
|
|
|
// Run the CLI. A failure anywhere in main() is reported through the logger and
// the process exits non-zero, rather than leaving the rejected promise unhandled.
main().catch((e) => {
  logger.error("Profile creation failed", e);
  process.exit(1);
});
|
|
|