Profile Creation Improvements (#136)

* interactive profile api improvements:
- refactor profile creation into separate class
- if profile starts with '@', load as relative path using current s3 storage
- support uploading profiles to s3
- profile api: support filename passed to /createProfileJS as part of json POST
- profile api: support /ping to keep profile browser running, --shutdownWait to add autoshutdown timeout (extendable via ping)
- profile api: add /target to retrieve target and /navigate to navigate by url.

* bump to 0.6.0-beta.0
This commit is contained in:
Ilya Kreymer 2022-05-05 14:27:17 -05:00 committed by GitHub
parent 5dfbfbeaf6
commit 500ed1f9a1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 199 additions and 51 deletions

View file

@ -25,7 +25,7 @@ const warcio = require("warcio");
const behaviors = fs.readFileSync(path.join(__dirname, "node_modules", "browsertrix-behaviors", "dist", "behaviors.js"), {encoding: "utf8"});
const TextExtract = require("./util/textextract");
const { S3StorageSync, getFileSize } = require("./util/storage");
const { initStorage, getFileSize } = require("./util/storage");
const { ScreenCaster, WSTransport, RedisPubSubTransport } = require("./util/screencaster");
const { parseArgs } = require("./util/argParser");
const { initRedis } = require("./util/redis");
@ -361,23 +361,8 @@ class Crawler {
return;
}
if (this.params.generateWACZ && process.env.STORE_ENDPOINT_URL) {
const endpointUrl = process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
const storeInfo = {
endpointUrl,
accessKey: process.env.STORE_ACCESS_KEY,
secretKey: process.env.STORE_SECRET_KEY,
};
const opts = {
crawlId: process.env.CRAWL_ID || os.hostname(),
webhookUrl: process.env.WEBHOOK_URL,
userId: process.env.STORE_USER,
filename: process.env.STORE_FILENAME || "@ts-@id.wacz",
};
console.log("Initing Storage...");
this.storage = new S3StorageSync(storeInfo, opts);
if (this.params.generateWACZ) {
this.storage = initStorage("data/");
}
// Puppeteer Cluster init and options

View file

@ -7,6 +7,7 @@ const puppeteer = require("puppeteer-core");
const yargs = require("yargs");
const { getBrowserExe, loadProfile, saveProfile, chromeArgs, sleep } = require("./util/browser");
const { initStorage } = require("./util/storage");
const fs = require("fs");
const path = require("path");
@ -15,7 +16,6 @@ const profileHTML = fs.readFileSync(path.join(__dirname, "html", "createProfile.
const behaviors = fs.readFileSync(path.join(__dirname, "node_modules", "browsertrix-behaviors", "dist", "behaviors.js"), {encoding: "utf8"});
function cliOpts() {
return {
"url": {
@ -53,6 +53,12 @@ function cliOpts() {
default: false,
},
"shutdownWait": {
describe: "Shutdown browser in interactive after this many seconds, if no pings received",
type: "number",
default: 0
},
"profile": {
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
type: "string",
@ -148,14 +154,15 @@ async function main() {
console.log("loading");
await page.goto(params.url, {waitUntil});
console.log("loaded");
if (params.interactive) {
await handleInteractive(params, browser, page);
new InteractiveBrowser(params, browser, page);
return;
}
let u, p;
try {
@ -190,7 +197,7 @@ async function main() {
process.exit(0);
}
async function createProfile(params, browser, page) {
async function createProfile(params, browser, page, targetFilename = "") {
await page._client.send("Network.clearBrowserCache");
await browser.close();
@ -201,7 +208,16 @@ async function createProfile(params, browser, page) {
saveProfile(profileFilename);
let resource = {};
const storage = initStorage("profiles/");
if (storage) {
console.log("Uploading to remote storage...");
resource = await storage.uploadFile(profileFilename, targetFilename);
}
console.log("done");
return resource;
}
function promptInput(msg, hidden = false) {
@ -234,25 +250,123 @@ function promptInput(msg, hidden = false) {
});
}
async function handleInteractive(params, browser, page) {
const target = page.target();
const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${target._targetId}&panel=resources`;
console.log("Creating Profile Interactively...");
child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
class InteractiveBrowser {
constructor(params, browser, page) {
console.log("Creating Profile Interactively...");
child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
const httpServer = http.createServer(async (req, res) => {
this.params = params;
this.browser = browser;
this.page = page;
const target = page.target();
this.targetId = target._targetId;
this.originSet = new Set();
this.addOrigin();
page.on("load", () => this.addOrigin());
this.shutdownWait = params.shutdownWait * 1000;
if (this.shutdownWait) {
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
console.log(`Shutting down in ${this.shutdownWait}ms if no ping received`);
} else {
this.shutdownTimer = 0;
}
const httpServer = http.createServer((req, res) => this.handleRequest(req, res));
const port = 9223;
httpServer.listen(port);
console.log(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);
}
addOrigin() {
const url = this.page.url();
console.log("Adding origin for", url);
if (url.startsWith("http:") || url.startsWith("https:")) {
this.originSet.add(new URL(url).origin);
}
}
async handleRequest(req, res) {
const parsedUrl = new URL(req.url, `http://${req.headers.host}`);
const pathname = parsedUrl.pathname;
if (pathname === "/") {
let targetUrl;
switch (pathname) {
case "/":
targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`;
res.writeHead(200, {"Content-Type": "text/html"});
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replaceAll("$HOST", parsedUrl.hostname)));
return;
} else if (pathname === "/createProfile" && req.method === "POST") {
case "/ping":
if (this.shutdownWait) {
clearInterval(this.shutdownTimer);
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
console.log(`Ping received, delaying shutdown for ${this.shutdownWait}ms`);
}
res.writeHead(200, {"Content-Type": "application/json"});
res.end(JSON.stringify({"pong": true}));
return;
case "/target":
res.writeHead(200, {"Content-Type": "application/json"});
res.end(JSON.stringify({targetId: this.targetId}));
return;
case "/createProfileJS":
if (req.method !== "POST") {
break;
}
try {
await createProfile(params, browser, page);
const buffers = [];
for await (const chunk of req) {
buffers.push(chunk);
}
const data = Buffer.concat(buffers).toString();
let targetFilename = "";
if (data.length) {
try {
targetFilename = JSON.parse(data).filename;
} catch (e) {
targetFilename = "";
}
}
console.log("target filename", targetFilename);
const resource = await createProfile(this.params, this.browser, this.page, targetFilename);
const origins = Array.from(this.originSet.values());
res.writeHead(200, {"Content-Type": "application/json"});
res.end(JSON.stringify({resource, origins}));
} catch (e) {
res.writeHead(500, {"Content-Type": "application/json"});
res.end(JSON.stringify({"error": e.toString()}));
console.log(e);
}
setTimeout(() => process.exit(0), 200);
return;
case "/createProfile":
if (req.method !== "POST") {
break;
}
try {
await createProfile(this.params, this.browser, this.page);
res.writeHead(200, {"Content-Type": "text/html"});
res.end("<html><body>Profile Created! You may now close this window.</body></html>");
@ -263,17 +377,14 @@ async function handleInteractive(params, browser, page) {
}
setTimeout(() => process.exit(0), 200);
} else {
res.writeHead(404, {"Content-Type": "text/html"});
res.end("Not Found");
return;
}
});
const port = 9223;
httpServer.listen(port);
console.log(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);
res.writeHead(404, {"Content-Type": "text/html"});
res.end("Not Found");
}
}
main();

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.5.1",
"version": "0.6.0-beta.0",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",

View file

@ -3,15 +3,16 @@ const fs = require("fs");
const path = require("path");
const os = require("os");
const request = require("request");
const { initStorage } = require("./storage");
const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
module.exports.loadProfile = async function(profileFilename) {
const targetFilename = "/tmp/profile.tar.gz";
if (profileFilename &&
(profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {
const targetFilename = "/tmp/profile.tar.gz";
console.log(`Downloading ${profileFilename} to ${targetFilename}`);
const p = new Promise((resolve, reject) => {
@ -23,6 +24,16 @@ module.exports.loadProfile = async function(profileFilename) {
await p;
profileFilename = targetFilename;
} else if (profileFilename && profileFilename.startsWith("@")) {
const storage = initStorage("");
if (!storage) {
throw new Error("Profile specified relative to s3 storage, but no S3 storage defined");
}
await storage.downloadFile(profileFilename.slice(1), targetFilename);
profileFilename = targetFilename;
}

View file

@ -13,7 +13,7 @@ const { initRedis } = require("./redis");
// ===========================================================================
class S3StorageSync
{
constructor(urlOrData, {filename, webhookUrl, userId, crawlId} = {}) {
constructor(urlOrData, {filename, webhookUrl, userId, crawlId, prefix = ""} = {}) {
let url;
let accessKey;
let secretKey;
@ -54,20 +54,36 @@ class S3StorageSync
this.crawlId = crawlId;
this.webhookUrl = webhookUrl;
this.filenamePrefix = prefix;
filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.]/g, ""));
filename = filename.replace("@hostname", os.hostname());
filename = filename.replace("@id", this.crawlId);
this.waczFilename = "data/" + filename;
this.targetFilename = this.filenamePrefix + filename;
}
async uploadCollWACZ(filename, completed = true) {
await this.client.fPutObject(this.bucketName, this.objectPrefix + this.waczFilename, filename);
async uploadFile(srcFilename, targetFilename) {
// allow overriding targetFilename
if (targetFilename) {
targetFilename = this.filenamePrefix + targetFilename;
} else {
targetFilename = this.targetFilename;
}
await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename);
const finalHash = await checksumFile("sha256", filename);
const finalHash = await checksumFile("sha256", srcFilename);
const size = await getFileSize(filename);
const resource = {"path": this.waczFilename, "hash": finalHash, "bytes": size};
const size = await getFileSize(srcFilename);
return {"path": targetFilename, "hash": finalHash, "bytes": size};
}
async downloadFile(srcFilename, destFilename) {
await this.client.fGetObject(this.bucketName, this.objectPrefix + srcFilename, destFilename);
}
async uploadCollWACZ(srcFilename, completed = true) {
const resource = await this.uploadFile(srcFilename, this.targetFilename);
console.log(resource);
if (this.webhookUrl) {
@ -76,7 +92,7 @@ class S3StorageSync
user: this.userId,
//filename: `s3://${this.bucketName}/${this.objectPrefix}${this.waczFilename}`,
filename: this.fullPrefix + this.waczFilename,
filename: this.fullPrefix + this.targetFilename,
hash: resource.hash,
size: resource.bytes,
@ -100,6 +116,31 @@ class S3StorageSync
}
}
/**
 * Build an S3StorageSync instance from environment configuration.
 *
 * Reads STORE_ENDPOINT_URL, STORE_PATH, STORE_ACCESS_KEY, STORE_SECRET_KEY,
 * CRAWL_ID, WEBHOOK_URL, STORE_USER and STORE_FILENAME from `process.env`.
 *
 * @param {string} [prefix=""] - passed through as the `prefix` option of the
 *   storage instance (e.g. "data/" or "profiles/" at the call sites)
 * @returns {S3StorageSync|null} the storage instance, or null when
 *   STORE_ENDPOINT_URL is not set
 */
function initStorage(prefix = "") {
  const env = process.env;

  // No endpoint configured means remote storage is disabled.
  if (!env.STORE_ENDPOINT_URL) {
    return null;
  }

  const storeInfo = {
    endpointUrl: env.STORE_ENDPOINT_URL + (env.STORE_PATH || ""),
    accessKey: env.STORE_ACCESS_KEY,
    secretKey: env.STORE_SECRET_KEY,
  };

  const opts = {
    // Fall back to the machine hostname when no crawl id is provided.
    crawlId: env.CRAWL_ID || os.hostname(),
    webhookUrl: env.WEBHOOK_URL,
    userId: env.STORE_USER,
    prefix,
    filename: env.STORE_FILENAME || "@ts-@id.wacz",
  };

  console.log("Initing Storage...");

  return new S3StorageSync(storeInfo, opts);
}
async function getFileSize(filename) {
const stats = await fsp.stat(filename);
return stats.size;
@ -117,5 +158,5 @@ function checksumFile(hashName, path) {
module.exports.S3StorageSync = S3StorageSync;
module.exports.getFileSize = getFileSize;
module.exports.initStorage = initStorage;