mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
* screencast support (fixes #43): - add NewWindowPage concurrency mode to support opening new window, and also reusing pages - add --screencastPort cli options to enable screencasting, uses websockets to stream frames to client - concurrency: add separate 'window' concurrency for opening new window per-page in same session, useful for screencasting with multiple workers but within same session * add warning if using screencasting + more than one worker + page context, recommend 'window' * cleanup: remove debug console, bump py-wacz dependency, improve close message * README: add screencasting info to README
This commit is contained in:
parent
e7d3767efb
commit
ae4ce979fb
8 changed files with 4111 additions and 3757 deletions
|
@ -41,6 +41,7 @@ RUN yarn install
|
|||
ADD config.yaml /app/
|
||||
ADD uwsgi.ini /app/
|
||||
ADD *.js /app/
|
||||
ADD screencast/ /app/screencast/
|
||||
|
||||
RUN ln -s /app/main.js /usr/bin/crawl
|
||||
RUN ln -s /app/create-login-profile.js /usr/bin/create-login-profile
|
||||
|
|
29
README.md
29
README.md
|
@ -123,6 +123,10 @@ Options:
|
|||
--profile Path to tar.gz file which will be
|
||||
extracted and used as the browser
|
||||
profile [string]
|
||||
--screencastPort If set to a non-zero value, starts
|
||||
an HTTP server with screencast
|
||||
accessible on this port
|
||||
[number] [default: 0]
|
||||
```
|
||||
|
||||
For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options).
|
||||
|
@ -130,6 +134,7 @@ For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.co
|
|||
The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example),
|
||||
while `--waitUntil networkidle0` may make sense for dynamic sites.
|
||||
|
||||
|
||||
### Behaviors
|
||||
|
||||
Browsertrix Crawler also supports automatically running customized in-browser behaviors. The behaviors auto-play videos (when possible),
|
||||
|
@ -140,6 +145,30 @@ Behaviors to run can be specified via a comma-separated list passed to the `--be
|
|||
|
||||
See [Browsertrix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) for more info on all of the currently available behaviors.
|
||||
|
||||
|
||||
### Watching the crawl -- Screencasting
|
||||
|
||||
With version 0.4.0, Browsertrix Crawler includes an experimental 'screencasting' option, which allows watching the crawl in real-time via screencast (connected via a websocket).
|
||||
|
||||
To enable, add the `--screencastPort` command-line option and also map the port on the docker container. An example command might be:
|
||||
|
||||
```
|
||||
docker-compose run -p 9037:9037 crawler crawl --url [URL] --screencastPort 9037
|
||||
```
|
||||
|
||||
Then, you can open `http://localhost:9037/` and watch the crawl.
|
||||
|
||||
Note: If specifying multiple workers, the crawler should additionally be instructed to open each one in a new window, otherwise the screencasting can only update one page at a time.
|
||||
|
||||
For example,
|
||||
|
||||
```
|
||||
docker-compose run -p 9037:9037 crawler crawl --url [URL] --screencastPort 9037 --newContext window --workers 3
|
||||
```
|
||||
|
||||
will start a crawl with 3 workers, and show the screen of each of the workers from `http://localhost:9037/`.
|
||||
|
||||
|
||||
## Creating and Using Browser Profiles
|
||||
|
||||
Browsertrix Crawler also includes a way to use existing browser profiles when running a crawl. This allows pre-configuring the browser, such as by logging in
|
||||
|
|
33
crawler.js
33
crawler.js
|
@ -30,6 +30,8 @@ const HTTPS_AGENT = require("https").Agent({
|
|||
|
||||
const HTTP_AGENT = require("http").Agent();
|
||||
|
||||
const { ScreenCaster, NewWindowPage } = require("./screencaster");
|
||||
|
||||
|
||||
// ============================================================================
|
||||
class Crawler {
|
||||
|
@ -176,7 +178,7 @@ class Crawler {
|
|||
},
|
||||
|
||||
"newContext": {
|
||||
describe: "The context for each new capture, can be a new: page, session or browser.",
|
||||
describe: "The context for each new capture, can be a new: page, window, session or browser.",
|
||||
default: "page",
|
||||
type: "string"
|
||||
},
|
||||
|
@ -318,6 +320,12 @@ class Crawler {
|
|||
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"screencastPort": {
|
||||
describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
|
||||
type: "number",
|
||||
default: 0
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -385,6 +393,9 @@ class Crawler {
|
|||
switch (argv.newContext) {
|
||||
case "page":
|
||||
argv.newContext = Cluster.CONCURRENCY_PAGE;
|
||||
if (argv.screencastPort && argv.workers > 1) {
|
||||
console.warn("Note: Screencast with >1 workers and default page context may only show one page at a time. To fix, add '--newContext window' to open each page in a new window");
|
||||
}
|
||||
break;
|
||||
|
||||
case "session":
|
||||
|
@ -395,6 +406,10 @@ class Crawler {
|
|||
argv.newContext = Cluster.CONCURRENCY_BROWSER;
|
||||
break;
|
||||
|
||||
case "window":
|
||||
argv.newContext = NewWindowPage;
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new Error("Invalid newContext, must be one of: page, session, browser");
|
||||
}
|
||||
|
@ -537,13 +552,18 @@ class Crawler {
|
|||
|
||||
async crawlPage({page, data}) {
|
||||
try {
|
||||
if (this.screencaster) {
|
||||
await this.screencaster.newTarget(page.target());
|
||||
}
|
||||
|
||||
if (this.emulateDevice) {
|
||||
await page.emulate(this.emulateDevice);
|
||||
}
|
||||
|
||||
if (this.behaviorOpts) {
|
||||
if (this.behaviorOpts && !page.__bx_inited) {
|
||||
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
|
||||
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.behaviorOpts});`);
|
||||
page.__bx_inited = true;
|
||||
}
|
||||
|
||||
// run custom driver here
|
||||
|
@ -566,6 +586,10 @@ class Crawler {
|
|||
|
||||
await this.writeStats();
|
||||
|
||||
if (this.screencaster) {
|
||||
await this.screencaster.endTarget(page.target());
|
||||
}
|
||||
|
||||
} catch (e) {
|
||||
console.warn(e);
|
||||
}
|
||||
|
@ -617,6 +641,10 @@ class Crawler {
|
|||
|
||||
await this.initPages();
|
||||
|
||||
if (this.params.screencastPort) {
|
||||
this.screencaster = new ScreenCaster(this.cluster, this.params.screencastPort);
|
||||
}
|
||||
|
||||
if (this.params.urlFile) {
|
||||
const urlSeedFile = await fsp.readFile(this.params.urlFile, "utf8");
|
||||
const urlSeedFileList = urlSeedFile.split("\n");
|
||||
|
@ -627,6 +655,7 @@ class Crawler {
|
|||
if (!this.params.urlFile) {
|
||||
this.queueUrl(this.params.url);
|
||||
}
|
||||
|
||||
if (this.params.useSitemap) {
|
||||
await this.parseSitemap(this.params.useSitemap);
|
||||
}
|
||||
|
|
|
@ -14,9 +14,10 @@
|
|||
"ioredis": "^4.27.1",
|
||||
"node-fetch": "^2.6.1",
|
||||
"puppeteer-cluster": "^0.22.0",
|
||||
"puppeteer-core": "^5.3.1",
|
||||
"puppeteer-core": "^8.0.0",
|
||||
"sitemapper": "^3.1.2",
|
||||
"uuid": "8.3.2",
|
||||
"ws": "^7.4.4",
|
||||
"yargs": "^16.0.3"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#pywb>=2.5.0
|
||||
git+https://github.com/webrecorder/pywb@main
|
||||
uwsgi
|
||||
wacz>=0.2.1
|
||||
wacz>=0.3.0
|
||||
|
|
76
screencast/index.html
Normal file
76
screencast/index.html
Normal file
|
@ -0,0 +1,76 @@
|
|||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
#content {
|
||||
display: flex;
|
||||
flex-direction: row;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
#content img {
|
||||
width: 640px;
|
||||
height: 480px;
|
||||
margin: 2rem;
|
||||
}
|
||||
</style>
|
||||
<script>
|
||||
const ws = new WebSocket(window.location.origin.replace("http", "ws") + "/ws");
|
||||
ws.addEventListener("message", (event) => handleMessage(event.data));
|
||||
|
||||
const unusedElems = [];
|
||||
|
||||
/**
 * Dispatch one JSON message received over the screencast websocket.
 *
 * "newTarget" and "screencast" messages make sure an <img> exists for the
 * target id and, when the message carries frame data, display that frame.
 * "endTarget" releases the target's <img> for reuse. Any other message type
 * is ignored.
 *
 * @param {string} resp - raw JSON text of the websocket message
 */
function handleMessage(resp) {
  const message = JSON.parse(resp);

  switch (message.msg) {
    case "newTarget":
    case "screencast": {
      // block-scoped so no global `img` is created as a side effect
      const img = createImage(message.id);
      if (message.data) {
        setImageData(img, message.data);
      }
      break;
    }

    case "endTarget":
      unuseImage(message.id);
      break;
  }
}
|
||||
|
||||
/**
 * Show a base64-encoded frame in the given image element.
 *
 * @param {HTMLImageElement} img - element whose `src` is replaced
 * @param {string} data - base64-encoded image bytes
 * @param {string} [format="png"] - image subtype used in the data: URL
 */
function setImageData(img, data, format = "png") {
  img.src = `data:image/${format};base64,${data}`;
}
|
||||
|
||||
/**
 * Return the <img> element that displays the target with the given id,
 * creating or recycling one when none exists yet.
 *
 * Lookup order:
 *   1. an element that already carries this id,
 *   2. the oldest element in the `unusedElems` pool (re-labelled with the id),
 *   3. a brand new <img> appended to `#content`.
 *
 * @param {string} id - target id used as the element id
 * @returns {HTMLImageElement}
 */
function createImage(id) {
  const existing = document.getElementById(id);
  if (existing) {
    // A released element keeps its id; if its target becomes active again it
    // must leave the pool, otherwise another target could take it while live.
    const pooledAt = unusedElems.indexOf(existing);
    if (pooledAt !== -1) {
      unusedElems.splice(pooledAt, 1);
    }
    return existing;
  }

  const recycled = unusedElems.shift();
  if (recycled) {
    recycled.setAttribute("id", id);
    return recycled;
  }

  const created = document.createElement("img");
  created.setAttribute("id", id);
  document.getElementById("content").appendChild(created);
  return created;
}
|
||||
|
||||
/**
 * Release the <img> element of a finished target so `createImage` can hand
 * it to another target later. The element stays in the page unchanged (it is
 * not hidden); it is only recorded in the `unusedElems` pool.
 *
 * Unknown ids are ignored, and an element that is already pooled is not
 * added a second time, so it can never be handed to two targets.
 *
 * @param {string} id - target id whose element should be released
 */
function unuseImage(id) {
  const elem = document.getElementById(id);
  if (!elem || unusedElems.includes(elem)) {
    return;
  }
  unusedElems.push(elem);
}
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
<div id="content">
|
||||
</div>
|
||||
</body>
|
227
screencaster.js
Normal file
227
screencaster.js
Normal file
|
@ -0,0 +1,227 @@
|
|||
const ws = require("ws");
|
||||
const http = require("http");
|
||||
const url = require("url");
|
||||
const fs = require("fs");
|
||||
|
||||
const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
|
||||
|
||||
const indexHTML = fs.readFileSync("/app/screencast/index.html", {encoding: "utf8"});
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
/**
 * Serves a small HTML viewer over HTTP and streams Chrome DevTools Protocol
 * screencast frames of crawled pages to every connected websocket client.
 *
 * State kept per target id:
 *   - `targets`: the CDP session used for screencasting
 *   - `caches`:  the most recent frame (sent to clients that connect later)
 *   - `urls`:    the target url at the time `newTarget` was called
 *
 * Screencasting is only active while at least one websocket client is
 * connected.
 */
class ScreenCaster {
  /**
   * @param {object} cluster - stored on the instance; not otherwise used here
   * @param {number} port - port the HTTP/websocket server listens on
   */
  constructor(cluster, port) {
    this.cluster = cluster;

    this.httpServer = http.createServer((req, res) => {
      const pathname = url.parse(req.url).pathname;
      if (pathname === "/") {
        res.writeHead(200, {"Content-Type": "text/html"});
        res.end(indexHTML);
      } else {
        res.writeHead(404, {"Content-Type": "text/html"});
        res.end("Not Found");
      }
    });

    this.allWS = new Set();

    this.targets = new Map();
    this.caches = new Map();
    this.urls = new Map();

    this.wss = new ws.Server({ noServer: true });

    this.wss.on("connection", (client) => this.initWebSocket(client));

    this.httpServer.on("upgrade", (request, socket, head) => {
      const pathname = url.parse(request.url).pathname;

      if (pathname !== "/ws") {
        // Nobody else handles this upgrade; close it instead of leaving the
        // socket open forever.
        socket.destroy();
        return;
      }

      this.wss.handleUpgrade(request, socket, head, (client) => {
        this.wss.emit("connection", client, request);
      });
    });

    this.httpServer.listen(port);
    console.log(`Screencast Server started on: ${port}`);
  }

  /**
   * Register a newly connected websocket client: replay all known targets
   * (with their last cached frame, if any), start screencasting when this is
   * the first client, and stop it again once the last client disconnects.
   *
   * @param {object} ws - connected websocket
   */
  initWebSocket(ws) {
    for (const id of this.targets.keys()) {
      const data = this.caches.get(id);
      const targetUrl = this.urls.get(id);
      ws.send(JSON.stringify({"msg": "newTarget", id, url: targetUrl, data}));
    }

    this.allWS.add(ws);

    if (this.allWS.size === 1) {
      // never rejects: startCastAll() settles every start individually
      this.startCastAll();
    }

    // An 'error' event without a listener would be thrown as an uncaught
    // exception by the EventEmitter.
    ws.on("error", (err) => {
      console.warn("Screencast WebSocket Error", err);
    });

    ws.on("close", () => {
      console.log("Screencast WebSocket Disconnected");
      this.allWS.delete(ws);

      if (this.allWS.size === 0) {
        this.stopCastAll();
      }
    });
  }

  /**
   * Broadcast a message (serialized once as JSON) to all connected clients.
   * A client whose send fails is logged and skipped so the remaining clients
   * still receive the message.
   *
   * @param {object} msg - JSON-serializable message
   */
  sendAll(msg) {
    const payload = JSON.stringify(msg);
    for (const client of this.allWS) {
      try {
        client.send(payload);
      } catch (e) {
        console.warn("Screencast: failed to send to client", e);
      }
    }
  }

  /**
   * Start tracking a page target: open a CDP session, announce the target to
   * clients, forward every screencast frame, and begin screencasting right
   * away if any client is connected.
   *
   * @param {object} target - puppeteer Target of the page being crawled
   */
  async newTarget(target) {
    const id = target._targetId;

    if (this.targets.has(id)) {
      // The same page is being reused and its previous crawl never reached
      // endTarget(); release the old session before opening a new one.
      await this.endTarget(target);
    }

    const cdp = await target.createCDPSession();
    const targetUrl = target.url();

    this.targets.set(id, cdp);
    this.urls.set(id, targetUrl);

    this.sendAll({"msg": "newTarget", id, url: targetUrl});

    cdp.on("Page.screencastFrame", async (resp) => {
      const {data, sessionId} = resp;

      this.sendAll({"msg": "screencast", id, data});
      this.caches.set(id, data);

      try {
        await cdp.send("Page.screencastFrameAck", {sessionId});
      } catch (e) {
        // The session can be gone by the time the ack is sent; an unhandled
        // rejection inside this event handler must not take the crawler down.
        console.warn("Screencast: frame ack failed", e);
      }
    });

    if (this.allWS.size) {
      await this.startCast(cdp);
    }
  }

  /**
   * Stop tracking a page target: stop its screencast, tell clients, drop all
   * per-target state and detach the CDP session. Unknown targets are ignored.
   * Protocol errors are logged and do not prevent the cleanup.
   *
   * @param {object} target - puppeteer Target previously passed to newTarget
   */
  async endTarget(target) {
    const id = target._targetId;
    const cdp = this.targets.get(id);
    if (!cdp) {
      return;
    }

    try {
      await this.stopCast(cdp);
    } catch (e) {
      console.warn("Screencast: failed to stop screencast", e);
    }

    this.sendAll({"msg": "endTarget", id});

    this.targets.delete(id);
    this.caches.delete(id);
    this.urls.delete(id);

    try {
      await cdp.detach();
    } catch (e) {
      console.warn("Screencast: failed to detach session", e);
    }
  }

  /**
   * Start screencasting on a CDP session unless it is already running.
   *
   * @param {object} cdp - CDP session of a tracked target
   */
  async startCast(cdp) {
    if (cdp._startedCast) {
      return;
    }

    cdp._startedCast = true;

    try {
      await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 1, maxWidth: 1024, maxHeight: 768});
    } catch (e) {
      // allow a later attempt instead of leaving the flag stuck at "started"
      cdp._startedCast = false;
      throw e;
    }
  }

  /**
   * Stop screencasting on a CDP session if it is currently running.
   *
   * @param {object} cdp - CDP session of a tracked target
   */
  async stopCast(cdp) {
    if (!cdp._startedCast) {
      return;
    }

    cdp._startedCast = false;
    await cdp.send("Page.stopScreencast");
  }

  /**
   * Start screencasting on every tracked target.
   *
   * @returns {Promise<Array>} settles once every start attempt has settled
   */
  startCastAll() {
    const sessions = [...this.targets.values()];
    return Promise.allSettled(sessions.map((cdp) => this.startCast(cdp)));
  }

  /**
   * Stop screencasting on every tracked target.
   *
   * @returns {Promise<Array>} settles once every stop attempt has settled
   */
  stopCastAll() {
    const sessions = [...this.targets.values()];
    return Promise.allSettled(sessions.map((cdp) => this.stopCast(cdp)));
  }
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
/**
 * puppeteer-cluster concurrency implementation (selected by the crawler for
 * `--newContext window`) that gives every worker its own browser window
 * inside a single browser session.
 *
 * Windows are opened from one "main" about:blank page via `window.open()`;
 * the resulting targets are picked up from the browser's `targetcreated`
 * event. With `reuse` enabled (the default set in `init`), released pages are
 * kept in `pages` and handed out again instead of being closed.
 */
class NewWindowPage extends SingleBrowserImplementation {
  /**
   * Initialize the base implementation, open the main page used to spawn
   * windows, and start collecting targets opened from it.
   */
  async init() {
    await super.init();

    this.newTargets = [];

    this.nextPromise();

    this.mainPage = await this.browser.newPage();

    this.pages = [];
    this.reuse = true;

    await this.mainPage.goto("about:blank");

    this.mainTarget = this.mainPage.target();

    this.browser.on("targetcreated", (target) => {
      // only windows opened by the main page belong to this implementation
      if (this._nextTarget && target.opener() === this.mainTarget) {
        this.newTargets.push(target);
        this._nextTarget();
        this.nextPromise();
      }
    });
  }

  /**
   * Arm a fresh promise (`_nextPromise`) that resolves, via `_nextTarget`,
   * when the next window opened from the main page is reported.
   */
  nextPromise() {
    this._nextPromise = new Promise((resolve) => this._nextTarget = resolve);
  }

  /**
   * Open a new window from the main page and return its page.
   *
   * Safe to call concurrently: a single `targetcreated` event wakes every
   * pending caller, so each caller re-checks the queue and keeps waiting on
   * the current `_nextPromise` until it actually obtains a target.
   *
   * @returns {Promise<{page: object}>}
   */
  async getNewPage() {
    await this.mainPage.evaluate("window.open('about:blank', '', 'resizable');");

    let target = this.newTargets.shift();
    while (!target) {
      await this._nextPromise;
      target = this.newTargets.shift();
    }

    return {page: await target.page() };
  }

  /**
   * Provide a page for a job: a previously released page when one is
   * available, otherwise a newly opened window.
   * (Presumably invoked by the puppeteer-cluster base class -- verify there.)
   *
   * @returns {Promise<{page: object}>}
   */
  async createResources() {
    if (this.pages.length) {
      return {page: this.pages.shift()};
    }
    return await this.getNewPage();
  }

  /**
   * Release a page after a job: keep it for reuse when `reuse` is set,
   * otherwise close it.
   *
   * @param {{page: object}} resources - value returned by createResources
   */
  async freeResources(resources) {
    if (this.reuse) {
      this.pages.push(resources.page);
    } else {
      await resources.page.close();
    }
  }
}
|
||||
|
||||
|
||||
|
||||
module.exports = { ScreenCaster, NewWindowPage };
|
Loading…
Add table
Add a link
Reference in a new issue