mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Profile Creation Improvements (#136)
* interactive profile api improvements: - refactor profile creation into separate class - if profile starts with '@', load as relative path using current s3 storage - support uploading profiles to s3 - profile api: support filename passed to /createProfileJS as part of json POST - profile api: support /ping to keep profile browser running, --shutdownWait to add autoshutdown timeout (extendable via ping) - profile api: add /target to retrieve target and /navigate to navigate by url. * bump to 0.6.0-beta.0
This commit is contained in:
parent
5dfbfbeaf6
commit
500ed1f9a1
5 changed files with 199 additions and 51 deletions
21
crawler.js
21
crawler.js
|
@ -25,7 +25,7 @@ const warcio = require("warcio");
|
|||
const behaviors = fs.readFileSync(path.join(__dirname, "node_modules", "browsertrix-behaviors", "dist", "behaviors.js"), {encoding: "utf8"});
|
||||
|
||||
const TextExtract = require("./util/textextract");
|
||||
const { S3StorageSync, getFileSize } = require("./util/storage");
|
||||
const { initStorage, getFileSize } = require("./util/storage");
|
||||
const { ScreenCaster, WSTransport, RedisPubSubTransport } = require("./util/screencaster");
|
||||
const { parseArgs } = require("./util/argParser");
|
||||
const { initRedis } = require("./util/redis");
|
||||
|
@ -361,23 +361,8 @@ class Crawler {
|
|||
return;
|
||||
}
|
||||
|
||||
if (this.params.generateWACZ && process.env.STORE_ENDPOINT_URL) {
|
||||
const endpointUrl = process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
|
||||
const storeInfo = {
|
||||
endpointUrl,
|
||||
accessKey: process.env.STORE_ACCESS_KEY,
|
||||
secretKey: process.env.STORE_SECRET_KEY,
|
||||
};
|
||||
|
||||
const opts = {
|
||||
crawlId: process.env.CRAWL_ID || os.hostname(),
|
||||
webhookUrl: process.env.WEBHOOK_URL,
|
||||
userId: process.env.STORE_USER,
|
||||
filename: process.env.STORE_FILENAME || "@ts-@id.wacz",
|
||||
};
|
||||
|
||||
console.log("Initing Storage...");
|
||||
this.storage = new S3StorageSync(storeInfo, opts);
|
||||
if (this.params.generateWACZ) {
|
||||
this.storage = initStorage("data/");
|
||||
}
|
||||
|
||||
// Puppeteer Cluster init and options
|
||||
|
|
|
@ -7,6 +7,7 @@ const puppeteer = require("puppeteer-core");
|
|||
const yargs = require("yargs");
|
||||
|
||||
const { getBrowserExe, loadProfile, saveProfile, chromeArgs, sleep } = require("./util/browser");
|
||||
const { initStorage } = require("./util/storage");
|
||||
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
|
@ -15,7 +16,6 @@ const profileHTML = fs.readFileSync(path.join(__dirname, "html", "createProfile.
|
|||
|
||||
const behaviors = fs.readFileSync(path.join(__dirname, "node_modules", "browsertrix-behaviors", "dist", "behaviors.js"), {encoding: "utf8"});
|
||||
|
||||
|
||||
function cliOpts() {
|
||||
return {
|
||||
"url": {
|
||||
|
@ -53,6 +53,12 @@ function cliOpts() {
|
|||
default: false,
|
||||
},
|
||||
|
||||
"shutdownWait": {
|
||||
describe: "Shutdown browser in interactive after this many seconds, if no pings received",
|
||||
type: "number",
|
||||
default: 0
|
||||
},
|
||||
|
||||
"profile": {
|
||||
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
|
||||
type: "string",
|
||||
|
@ -148,14 +154,15 @@ async function main() {
|
|||
console.log("loading");
|
||||
|
||||
await page.goto(params.url, {waitUntil});
|
||||
|
||||
|
||||
console.log("loaded");
|
||||
|
||||
if (params.interactive) {
|
||||
await handleInteractive(params, browser, page);
|
||||
new InteractiveBrowser(params, browser, page);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
let u, p;
|
||||
|
||||
try {
|
||||
|
@ -190,7 +197,7 @@ async function main() {
|
|||
process.exit(0);
|
||||
}
|
||||
|
||||
async function createProfile(params, browser, page) {
|
||||
async function createProfile(params, browser, page, targetFilename = "") {
|
||||
await page._client.send("Network.clearBrowserCache");
|
||||
|
||||
await browser.close();
|
||||
|
@ -201,7 +208,16 @@ async function createProfile(params, browser, page) {
|
|||
|
||||
saveProfile(profileFilename);
|
||||
|
||||
let resource = {};
|
||||
|
||||
const storage = initStorage("profiles/");
|
||||
if (storage) {
|
||||
console.log("Uploading to remote storage...");
|
||||
resource = await storage.uploadFile(profileFilename, targetFilename);
|
||||
}
|
||||
|
||||
console.log("done");
|
||||
return resource;
|
||||
}
|
||||
|
||||
function promptInput(msg, hidden = false) {
|
||||
|
@ -234,25 +250,123 @@ function promptInput(msg, hidden = false) {
|
|||
});
|
||||
}
|
||||
|
||||
async function handleInteractive(params, browser, page) {
|
||||
const target = page.target();
|
||||
const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${target._targetId}&panel=resources`;
|
||||
|
||||
console.log("Creating Profile Interactively...");
|
||||
child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
|
||||
class InteractiveBrowser {
|
||||
constructor(params, browser, page) {
|
||||
console.log("Creating Profile Interactively...");
|
||||
child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
|
||||
|
||||
const httpServer = http.createServer(async (req, res) => {
|
||||
this.params = params;
|
||||
this.browser = browser;
|
||||
this.page = page;
|
||||
|
||||
const target = page.target();
|
||||
this.targetId = target._targetId;
|
||||
|
||||
this.originSet = new Set();
|
||||
|
||||
this.addOrigin();
|
||||
|
||||
page.on("load", () => this.addOrigin());
|
||||
|
||||
this.shutdownWait = params.shutdownWait * 1000;
|
||||
|
||||
if (this.shutdownWait) {
|
||||
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
|
||||
console.log(`Shutting down in ${this.shutdownWait}ms if no ping received`);
|
||||
} else {
|
||||
this.shutdownTimer = 0;
|
||||
}
|
||||
|
||||
const httpServer = http.createServer((req, res) => this.handleRequest(req, res));
|
||||
const port = 9223;
|
||||
httpServer.listen(port);
|
||||
console.log(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);
|
||||
}
|
||||
|
||||
addOrigin() {
|
||||
const url = this.page.url();
|
||||
console.log("Adding origin for", url);
|
||||
if (url.startsWith("http:") || url.startsWith("https:")) {
|
||||
this.originSet.add(new URL(url).origin);
|
||||
}
|
||||
}
|
||||
|
||||
async handleRequest(req, res) {
|
||||
const parsedUrl = new URL(req.url, `http://${req.headers.host}`);
|
||||
const pathname = parsedUrl.pathname;
|
||||
if (pathname === "/") {
|
||||
let targetUrl;
|
||||
|
||||
switch (pathname) {
|
||||
case "/":
|
||||
targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`;
|
||||
res.writeHead(200, {"Content-Type": "text/html"});
|
||||
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replaceAll("$HOST", parsedUrl.hostname)));
|
||||
return;
|
||||
|
||||
} else if (pathname === "/createProfile" && req.method === "POST") {
|
||||
case "/ping":
|
||||
if (this.shutdownWait) {
|
||||
clearInterval(this.shutdownTimer);
|
||||
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
|
||||
console.log(`Ping received, delaying shutdown for ${this.shutdownWait}ms`);
|
||||
}
|
||||
res.writeHead(200, {"Content-Type": "application/json"});
|
||||
res.end(JSON.stringify({"pong": true}));
|
||||
return;
|
||||
|
||||
case "/target":
|
||||
res.writeHead(200, {"Content-Type": "application/json"});
|
||||
res.end(JSON.stringify({targetId: this.targetId}));
|
||||
return;
|
||||
|
||||
case "/createProfileJS":
|
||||
if (req.method !== "POST") {
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
await createProfile(params, browser, page);
|
||||
|
||||
const buffers = [];
|
||||
|
||||
for await (const chunk of req) {
|
||||
buffers.push(chunk);
|
||||
}
|
||||
|
||||
const data = Buffer.concat(buffers).toString();
|
||||
|
||||
let targetFilename = "";
|
||||
|
||||
if (data.length) {
|
||||
try {
|
||||
targetFilename = JSON.parse(data).filename;
|
||||
} catch (e) {
|
||||
targetFilename = "";
|
||||
}
|
||||
}
|
||||
|
||||
console.log("target filename", targetFilename);
|
||||
|
||||
const resource = await createProfile(this.params, this.browser, this.page, targetFilename);
|
||||
const origins = Array.from(this.originSet.values());
|
||||
|
||||
res.writeHead(200, {"Content-Type": "application/json"});
|
||||
res.end(JSON.stringify({resource, origins}));
|
||||
} catch (e) {
|
||||
res.writeHead(500, {"Content-Type": "application/json"});
|
||||
res.end(JSON.stringify({"error": e.toString()}));
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
setTimeout(() => process.exit(0), 200);
|
||||
return;
|
||||
|
||||
case "/createProfile":
|
||||
if (req.method !== "POST") {
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
await createProfile(this.params, this.browser, this.page);
|
||||
|
||||
res.writeHead(200, {"Content-Type": "text/html"});
|
||||
res.end("<html><body>Profile Created! You may now close this window.</body></html>");
|
||||
|
@ -263,17 +377,14 @@ async function handleInteractive(params, browser, page) {
|
|||
}
|
||||
|
||||
setTimeout(() => process.exit(0), 200);
|
||||
|
||||
} else {
|
||||
res.writeHead(404, {"Content-Type": "text/html"});
|
||||
res.end("Not Found");
|
||||
return;
|
||||
}
|
||||
});
|
||||
|
||||
const port = 9223;
|
||||
httpServer.listen(port);
|
||||
console.log(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);
|
||||
res.writeHead(404, {"Content-Type": "text/html"});
|
||||
res.end("Not Found");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
main();
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "0.5.1",
|
||||
"version": "0.6.0-beta.0",
|
||||
"main": "browsertrix-crawler",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||
|
|
|
@ -3,15 +3,16 @@ const fs = require("fs");
|
|||
const path = require("path");
|
||||
const os = require("os");
|
||||
const request = require("request");
|
||||
const { initStorage } = require("./storage");
|
||||
|
||||
const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
|
||||
|
||||
module.exports.loadProfile = async function(profileFilename) {
|
||||
const targetFilename = "/tmp/profile.tar.gz";
|
||||
|
||||
if (profileFilename &&
|
||||
(profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {
|
||||
|
||||
const targetFilename = "/tmp/profile.tar.gz";
|
||||
|
||||
console.log(`Downloading ${profileFilename} to ${targetFilename}`);
|
||||
|
||||
const p = new Promise((resolve, reject) => {
|
||||
|
@ -23,6 +24,16 @@ module.exports.loadProfile = async function(profileFilename) {
|
|||
|
||||
await p;
|
||||
|
||||
profileFilename = targetFilename;
|
||||
} else if (profileFilename && profileFilename.startsWith("@")) {
|
||||
const storage = initStorage("");
|
||||
|
||||
if (!storage) {
|
||||
throw new Error("Profile specified relative to s3 storage, but no S3 storage defined");
|
||||
}
|
||||
|
||||
await storage.downloadFile(profileFilename.slice(1), targetFilename);
|
||||
|
||||
profileFilename = targetFilename;
|
||||
}
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ const { initRedis } = require("./redis");
|
|||
// ===========================================================================
|
||||
class S3StorageSync
|
||||
{
|
||||
constructor(urlOrData, {filename, webhookUrl, userId, crawlId} = {}) {
|
||||
constructor(urlOrData, {filename, webhookUrl, userId, crawlId, prefix = ""} = {}) {
|
||||
let url;
|
||||
let accessKey;
|
||||
let secretKey;
|
||||
|
@ -54,20 +54,36 @@ class S3StorageSync
|
|||
this.crawlId = crawlId;
|
||||
this.webhookUrl = webhookUrl;
|
||||
|
||||
this.filenamePrefix = prefix;
|
||||
|
||||
filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.]/g, ""));
|
||||
filename = filename.replace("@hostname", os.hostname());
|
||||
filename = filename.replace("@id", this.crawlId);
|
||||
|
||||
this.waczFilename = "data/" + filename;
|
||||
this.targetFilename = this.filenamePrefix + filename;
|
||||
}
|
||||
|
||||
async uploadCollWACZ(filename, completed = true) {
|
||||
await this.client.fPutObject(this.bucketName, this.objectPrefix + this.waczFilename, filename);
|
||||
async uploadFile(srcFilename, targetFilename) {
|
||||
// allow overriding targetFilename
|
||||
if (targetFilename) {
|
||||
targetFilename = this.filenamePrefix + targetFilename;
|
||||
} else {
|
||||
targetFilename = this.targetFilename;
|
||||
}
|
||||
await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename);
|
||||
|
||||
const finalHash = await checksumFile("sha256", filename);
|
||||
const finalHash = await checksumFile("sha256", srcFilename);
|
||||
|
||||
const size = await getFileSize(filename);
|
||||
const resource = {"path": this.waczFilename, "hash": finalHash, "bytes": size};
|
||||
const size = await getFileSize(srcFilename);
|
||||
return {"path": targetFilename, "hash": finalHash, "bytes": size};
|
||||
}
|
||||
|
||||
async downloadFile(srcFilename, destFilename) {
|
||||
await this.client.fGetObject(this.bucketName, this.objectPrefix + srcFilename, destFilename);
|
||||
}
|
||||
|
||||
async uploadCollWACZ(srcFilename, completed = true) {
|
||||
const resource = await this.uploadFile(srcFilename, this.targetFilename);
|
||||
console.log(resource);
|
||||
|
||||
if (this.webhookUrl) {
|
||||
|
@ -76,7 +92,7 @@ class S3StorageSync
|
|||
user: this.userId,
|
||||
|
||||
//filename: `s3://${this.bucketName}/${this.objectPrefix}${this.waczFilename}`,
|
||||
filename: this.fullPrefix + this.waczFilename,
|
||||
filename: this.fullPrefix + this.targetFilename,
|
||||
|
||||
hash: resource.hash,
|
||||
size: resource.bytes,
|
||||
|
@ -100,6 +116,31 @@ class S3StorageSync
|
|||
}
|
||||
}
|
||||
|
||||
function initStorage(prefix = "") {
|
||||
if (!process.env.STORE_ENDPOINT_URL) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const endpointUrl = process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
|
||||
const storeInfo = {
|
||||
endpointUrl,
|
||||
accessKey: process.env.STORE_ACCESS_KEY,
|
||||
secretKey: process.env.STORE_SECRET_KEY,
|
||||
};
|
||||
|
||||
const opts = {
|
||||
crawlId: process.env.CRAWL_ID || os.hostname(),
|
||||
webhookUrl: process.env.WEBHOOK_URL,
|
||||
userId: process.env.STORE_USER,
|
||||
prefix,
|
||||
filename: process.env.STORE_FILENAME || "@ts-@id.wacz",
|
||||
};
|
||||
|
||||
console.log("Initing Storage...");
|
||||
return new S3StorageSync(storeInfo, opts);
|
||||
}
|
||||
|
||||
|
||||
async function getFileSize(filename) {
|
||||
const stats = await fsp.stat(filename);
|
||||
return stats.size;
|
||||
|
@ -117,5 +158,5 @@ function checksumFile(hashName, path) {
|
|||
|
||||
module.exports.S3StorageSync = S3StorageSync;
|
||||
module.exports.getFileSize = getFileSize;
|
||||
|
||||
module.exports.initStorage = initStorage;
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue