Screencast Support for Debugging (fixes #43) (#52)

* screencast support (fixes #43):

- add NewWindowPage concurrency mode to support opening new window, and also reusing pages

- add --screencastPort cli options to enable screencasting, uses websockets to stream frames to client

- concurrency: add separate 'window' concurrency for opening new window per-page in same session, useful for screencasting with multiple workers but within same session

* add warning if using screencasting + more than one worker + page context, recommend 'window'

* cleanup: remove debug console, bump py-wacz dependency, improve close message

* README: add screencasting info to README
This commit is contained in:
Ilya Kreymer 2021-06-07 17:43:36 -07:00 committed by GitHub
parent e7d3767efb
commit ae4ce979fb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 4111 additions and 3757 deletions

View file

@ -41,6 +41,7 @@ RUN yarn install
ADD config.yaml /app/
ADD uwsgi.ini /app/
ADD *.js /app/
ADD screencast/ /app/screencast/
RUN ln -s /app/main.js /usr/bin/crawl
RUN ln -s /app/create-login-profile.js /usr/bin/create-login-profile

View file

@ -123,6 +123,10 @@ Options:
--profile Path to tar.gz file which will be
extracted and used as the browser
profile [string]
--screencastPort If set to a non-zero value, starts
an HTTP server with screencast
accessible on this port
[number] [default: 0]
```
For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options).
@ -130,6 +134,7 @@ For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.co
The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example),
while `--waitUntil networkidle0` may make sense for dynamic sites.
### Behaviors
Browsertrix Crawler also supports automatically running customized in-browser behaviors. The behaviors auto-play videos (when possible),
@ -140,6 +145,30 @@ Behaviors to run can be specified via a comma-separated list passed to the `--be
See [Browsertrix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) for more info on all of the currently available behaviors.
### Watching the crawl -- Screencasting
With version 0.4.0, Browsertrix Crawler includes an experimental 'screencasting' option, which allows watching the crawl in real-time via screencast (connected via a websocket).
To enable, add `--screencastPort` command-line option and also map the port on the docker container. An example command might be:
```
docker-compose run -p 9037:9037 crawler crawl --url [URL] --screencastPort 9037
```
Then, you can open `http://localhost:9037/` and watch the crawl.
Note: If specifying multiple workers, the crawler should additionally be instructed to open each one in a new window; otherwise, the screencast can only update one page at a time.
For example,
```
docker-compose run -p 9037:9037 crawler crawl --url [URL] --screencastPort 9037 --newContext window --workers 3
```
will start a crawl with 3 workers, and show the screen of each of the workers from `http://localhost:9037/`.
## Creating and Using Browser Profiles
Browsertrix Crawler also includes a way to use existing browser profiles when running a crawl. This allows pre-configuring the browser, such as by logging in

View file

@ -30,6 +30,8 @@ const HTTPS_AGENT = require("https").Agent({
const HTTP_AGENT = require("http").Agent();
const { ScreenCaster, NewWindowPage } = require("./screencaster");
// ============================================================================
class Crawler {
@ -176,7 +178,7 @@ class Crawler {
},
"newContext": {
describe: "The context for each new capture, can be a new: page, session or browser.",
describe: "The context for each new capture, can be a new: page, window, session or browser.",
default: "page",
type: "string"
},
@ -318,6 +320,12 @@ class Crawler {
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
type: "string",
},
"screencastPort": {
describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
type: "number",
default: 0
}
};
}
@ -385,6 +393,9 @@ class Crawler {
switch (argv.newContext) {
case "page":
argv.newContext = Cluster.CONCURRENCY_PAGE;
if (argv.screencastPort && argv.workers > 1) {
console.warn("Note: Screencast with >1 workers and default page context may only show one page at a time. To fix, add '--newContext window' to open each page in a new window");
}
break;
case "session":
@ -395,6 +406,10 @@ class Crawler {
argv.newContext = Cluster.CONCURRENCY_BROWSER;
break;
case "window":
argv.newContext = NewWindowPage;
break;
default:
throw new Error("Invalid newContext, must be one of: page, session, browser");
}
@ -537,13 +552,18 @@ class Crawler {
async crawlPage({page, data}) {
try {
if (this.screencaster) {
await this.screencaster.newTarget(page.target());
}
if (this.emulateDevice) {
await page.emulate(this.emulateDevice);
}
if (this.behaviorOpts) {
if (this.behaviorOpts && !page.__bx_inited) {
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.behaviorOpts});`);
page.__bx_inited = true;
}
// run custom driver here
@ -566,6 +586,10 @@ class Crawler {
await this.writeStats();
if (this.screencaster) {
await this.screencaster.endTarget(page.target());
}
} catch (e) {
console.warn(e);
}
@ -617,6 +641,10 @@ class Crawler {
await this.initPages();
if (this.params.screencastPort) {
this.screencaster = new ScreenCaster(this.cluster, this.params.screencastPort);
}
if (this.params.urlFile) {
const urlSeedFile = await fsp.readFile(this.params.urlFile, "utf8");
const urlSeedFileList = urlSeedFile.split("\n");
@ -627,6 +655,7 @@ class Crawler {
if (!this.params.urlFile) {
this.queueUrl(this.params.url);
}
if (this.params.useSitemap) {
await this.parseSitemap(this.params.useSitemap);
}

View file

@ -14,9 +14,10 @@
"ioredis": "^4.27.1",
"node-fetch": "^2.6.1",
"puppeteer-cluster": "^0.22.0",
"puppeteer-core": "^5.3.1",
"puppeteer-core": "^8.0.0",
"sitemapper": "^3.1.2",
"uuid": "8.3.2",
"ws": "^7.4.4",
"yargs": "^16.0.3"
},
"devDependencies": {

View file

@ -1,4 +1,4 @@
#pywb>=2.5.0
git+https://github.com/webrecorder/pywb@main
uwsgi
wacz>=0.2.1
wacz>=0.3.0

76
screencast/index.html Normal file
View file

@ -0,0 +1,76 @@
<!doctype html>
<html>
<head>
<style>
#content {
display: flex;
flex-direction: row;
flex-wrap: wrap;
}
#content img {
width: 640px;
height: 480px;
margin: 2rem;
}
</style>
<script>
// Connect back to the server that delivered this page: same origin with the
// scheme switched from http(s) to ws(s), on the "/ws" path.
const socketUrl = window.location.origin.replace("http", "ws") + "/ws";
const ws = new WebSocket(socketUrl);

// Every message is a JSON string; hand the raw payload to handleMessage().
ws.addEventListener("message", ({data}) => handleMessage(data));

// <img> elements whose target has ended, kept around so they can be reused.
const unusedElems = [];
/**
 * Dispatch one message received from the screencast WebSocket.
 *
 * "newTarget" and "screencast" messages make sure an <img> exists for the
 * target id and, when the message carries frame data, display it.
 * "endTarget" messages release the target's <img> for reuse.
 * Messages with any other `msg` value are ignored.
 *
 * @param {string} resp - JSON text with `msg`, `id` and optionally `data`
 */
function handleMessage(resp) {
  const message = JSON.parse(resp);

  switch (message.msg) {
    case "newTarget":
    case "screencast": {
      // declared locally: the bare assignment used before created an
      // implicit global (and is a ReferenceError in strict/module code)
      const img = createImage(message.id);
      if (message.data) {
        setImageData(img, message.data);
      }
      break;
    }

    case "endTarget":
      unuseImage(message.id);
      break;
  }
}
// Display a base64-encoded PNG frame in the given <img> by pointing its
// `src` at an inline data: URL.
function setImageData(img, data) {
  img.src = `data:image/png;base64,${data}`;
}
/**
 * Return the <img> element that displays the target with the given id,
 * creating or recycling one if necessary.
 *
 * Lookup order:
 *   1. an element that already carries this id,
 *   2. the oldest element in the `unusedElems` pool (re-labelled with the id),
 *   3. a brand new <img> appended to #content.
 *
 * @param {string} id - target id, used as the element id
 * @returns {HTMLImageElement} the element to draw frames into
 */
function createImage(id) {
  const existing = document.getElementById(id);
  if (existing) {
    // A released element keeps its id, so a target id that ends and then
    // starts again is found here while still sitting in the pool. Take it
    // out of the pool, otherwise the next new target would shift() this
    // same element and steal it from the target that is now using it.
    const pooledAt = unusedElems.indexOf(existing);
    if (pooledAt !== -1) {
      unusedElems.splice(pooledAt, 1);
    }
    return existing;
  }

  if (unusedElems.length) {
    const recycled = unusedElems.shift();
    recycled.setAttribute("id", id);
    return recycled;
  }

  const created = document.createElement("img");
  created.setAttribute("id", id);
  document.getElementById("content").appendChild(created);
  return created;
}
// Release the <img> belonging to a finished target: if an element with this
// id exists it is appended to the `unusedElems` pool so a later target can
// take it over. The element stays in the page and keeps its last frame.
function unuseImage(id) {
  const released = document.getElementById(id);
  if (released) {
    unusedElems.push(released);
  }
}
</script>
</head>
<body>
<div id="content">
</div>
</body>

227
screencaster.js Normal file
View file

@ -0,0 +1,227 @@
const ws = require("ws");
const http = require("http");
const url = require("url");
const fs = require("fs");
const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
const indexHTML = fs.readFileSync("/app/screencast/index.html", {encoding: "utf8"});
// ===========================================================================
/**
 * Streams live frames of crawler pages to browser clients.
 *
 * An HTTP server serves the viewer page on "/" and upgrades requests for
 * "/ws" to WebSockets. Each page registered via newTarget() gets its own CDP
 * session; frames are requested with Page.startScreencast only while at least
 * one WebSocket client is connected, and the latest frame per target is cached
 * so that a newly connected client immediately has something to show.
 */
class ScreenCaster
{
  /**
   * @param {object} cluster - stored on the instance; not otherwise used here
   * @param {number} port - port the HTTP/WebSocket server listens on
   */
  constructor(cluster, port) {
    this.cluster = cluster;

    this.httpServer = http.createServer((req, res) => {
      const pathname = url.parse(req.url).pathname;
      if (pathname === "/") {
        res.writeHead(200, {"Content-Type": "text/html"});
        res.end(indexHTML);
      } else {
        res.writeHead(404, {"Content-Type": "text/html"});
        res.end("Not Found");
      }
    });

    // connected client sockets
    this.allWS = new Set();

    // target id -> CDP session / last frame (base64) / page url
    this.targets = new Map();
    this.caches = new Map();
    this.urls = new Map();

    this.wss = new ws.Server({ noServer: true });

    this.wss.on("connection", (ws) => this.initWebSocket(ws));

    this.httpServer.on("upgrade", (request, socket, head) => {
      const pathname = url.parse(request.url).pathname;

      if (pathname === "/ws") {
        this.wss.handleUpgrade(request, socket, head, (ws) => {
          this.wss.emit("connection", ws, request);
        });
      } else {
        // once an "upgrade" listener is installed the handler owns the
        // socket; drop requests for other paths instead of leaving them open
        socket.destroy();
      }
    });

    this.httpServer.listen(port);
    console.log(`Screencast Server started on: ${port}`);
  }

  /**
   * Register a newly connected client: replay the known targets (with their
   * cached frame, if any), and start screencasting if this is the first client.
   * When the last client disconnects, screencasting is stopped again.
   *
   * @param {object} ws - connected WebSocket
   */
  initWebSocket(ws) {
    // an "error" event without a listener is thrown by the EventEmitter and
    // would take down the whole crawl; the socket emits "close" afterwards,
    // which performs the actual cleanup below
    ws.on("error", (err) => {
      console.warn("Screencast WebSocket Error", err);
    });

    for (const id of this.targets.keys()) {
      const data = this.caches.get(id);
      const url = this.urls.get(id);
      const msg = {"msg": "newTarget", id, url, data};
      ws.send(JSON.stringify(msg));
    }

    this.allWS.add(ws);

    if (this.allWS.size === 1) {
      this.startCastAll();
    }

    ws.on("close", () => {
      console.log("Screencast WebSocket Disconnected");
      this.allWS.delete(ws);

      if (this.allWS.size === 0) {
        this.stopCastAll();
      }
    });
  }

  /**
   * Send a message (serialized once as JSON) to every client whose socket is
   * currently open. Sockets that are closing or closed are skipped; they are
   * removed from the set by their "close" handler.
   *
   * @param {object} msg - JSON-serializable message
   */
  sendAll(msg) {
    const payload = JSON.stringify(msg);
    for (const client of this.allWS) {
      if (client.readyState === client.OPEN) {
        client.send(payload);
      }
    }
  }

  /**
   * Start tracking a page target: open a CDP session, announce the target to
   * all clients and forward its screencast frames. Screencasting is started
   * right away only if a client is connected.
   *
   * @param {object} target - puppeteer Target of the page
   */
  async newTarget(target) {
    const id = target._targetId;

    // The same target can be registered again without endTarget() having run
    // (reused page whose previous job failed early). Release the previous
    // session first so it does not keep streaming in the background.
    if (this.targets.has(id)) {
      try {
        await this.endTarget(target);
      } catch (e) {
        console.warn(e);
      }
    }

    const cdp = await target.createCDPSession();
    const url = target.url();

    this.targets.set(id, cdp);
    this.urls.set(id, url);

    this.sendAll({"msg": "newTarget", id, url});

    cdp.on("Page.screencastFrame", async (resp) => {
      const {data, sessionId} = resp;

      // ignore frames that arrive after this session was unregistered,
      // otherwise they would re-create the cache entry of an ended target
      if (this.targets.get(id) === cdp) {
        this.sendAll({"msg": "screencast", id, data});
        this.caches.set(id, data);
      }

      try {
        await cdp.send("Page.screencastFrameAck", {sessionId});
      } catch (e) {
        // the session may already be gone; nobody awaits this handler, so a
        // rejection here would otherwise be unhandled
        console.warn("Screencast frame ack failed", e);
      }
    });

    if (this.allWS.size) {
      await this.startCast(cdp);
    }
  }

  /**
   * Stop tracking a page target: stop its screencast, tell clients the target
   * ended, forget its cached state and detach the CDP session. Does nothing
   * for unknown targets. The bookkeeping is cleaned up even if stopping the
   * screencast fails; the failure is still reported to the caller.
   *
   * @param {object} target - puppeteer Target of the page
   */
  async endTarget(target) {
    const id = target._targetId;
    const cdp = this.targets.get(id);
    if (!cdp) {
      return;
    }

    try {
      await this.stopCast(cdp);
    } finally {
      this.sendAll({"msg": "endTarget", id});

      this.targets.delete(id);
      this.caches.delete(id);
      this.urls.delete(id);

      await cdp.detach();
    }
  }

  /**
   * Start screencasting on a CDP session unless it is already running.
   *
   * @param {object} cdp - CDP session of a tracked target
   */
  async startCast(cdp) {
    if (cdp._startedCast) {
      return;
    }

    cdp._startedCast = true;

    try {
      await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 1, maxWidth: 1024, maxHeight: 768});
    } catch (e) {
      // allow a later attempt instead of staying marked as started
      cdp._startedCast = false;
      throw e;
    }
  }

  /**
   * Stop screencasting on a CDP session if it is running.
   *
   * @param {object} cdp - CDP session of a tracked target
   */
  async stopCast(cdp) {
    if (!cdp._startedCast) {
      return;
    }

    cdp._startedCast = false;
    await cdp.send("Page.stopScreencast");
  }

  /**
   * Start screencasting on every tracked target.
   *
   * @returns {Promise<Array>} settles once every start attempt has settled
   */
  startCastAll() {
    const promises = [];

    for (const cdp of this.targets.values()) {
      promises.push(this.startCast(cdp));
    }

    return Promise.allSettled(promises);
  }

  /**
   * Stop screencasting on every tracked target.
   *
   * @returns {Promise<Array>} settles once every stop attempt has settled
   */
  stopCastAll() {
    const promises = [];

    for (const cdp of this.targets.values()) {
      promises.push(this.stopCast(cdp));
    }

    return Promise.allSettled(promises);
  }
}
// ===========================================================================
/**
 * puppeteer-cluster concurrency implementation that runs every job in its own
 * browser window inside a single browser. Windows are opened by calling
 * window.open() from a dedicated main page and, while `reuse` is true, are
 * handed back to a pool after each job instead of being closed.
 */
class NewWindowPage extends SingleBrowserImplementation {
  /**
   * Launch the browser (via the base class), open the main page used to spawn
   * windows and start listening for the windows it creates.
   */
  async init() {
    await super.init();

    // targets of freshly opened windows that have not been claimed yet
    this.newTargets = [];

    // tail of the chain of pending getNewPage() calls, see getNewPage()
    this._openQueue = Promise.resolve();

    this.nextPromise();

    this.mainPage = await this.browser.newPage();

    // idle pages waiting to be reused by createResources()
    this.pages = [];
    this.reuse = true;

    await this.mainPage.goto("about:blank");

    this.mainTarget = this.mainPage.target();

    this.browser.on("targetcreated", (target) => {
      // only windows opened from the main page are ours
      if (this._nextTarget && target.opener() === this.mainTarget) {
        this.newTargets.push(target);
        this._nextTarget();
        this.nextPromise();
      }
    });
  }

  /**
   * Arm a fresh promise that resolves when the next window opened from the
   * main page is reported by the browser.
   */
  nextPromise() {
    this._nextPromise = new Promise((resolve) => this._nextTarget = resolve);
  }

  /**
   * Open a new window from the main page and return its page.
   *
   * Calls are serialized: there is only one "next window" promise, so if
   * several workers opened windows at the same time the first window to
   * appear would wake all of them and all but one would find no target to
   * claim. Each call therefore waits for the previous one to finish.
   *
   * @returns {Promise<{page: object}>} resources object holding the new page
   */
  async getNewPage() {
    const previous = this._openQueue;
    let release;
    this._openQueue = new Promise((resolve) => {
      release = resolve;
    });

    await previous;

    try {
      const opened = this._nextPromise;

      await this.mainPage.evaluate("window.open('about:blank', '', 'resizable');");

      await opened;

      const target = this.newTargets.shift();

      return {page: await target.page() };
    } finally {
      // always let the next caller proceed, even if opening failed
      release();
    }
  }

  /**
   * Provide a page for a job: an idle pooled page if there is one, otherwise
   * a newly opened window.
   *
   * @returns {Promise<{page: object}>}
   */
  async createResources() {
    if (this.pages.length) {
      return {page: this.pages.shift()};
    }
    return await this.getNewPage();
  }

  /**
   * Take a page back after a job: pool it when `reuse` is set, else close it.
   *
   * @param {{page: object}} resources - value returned by createResources()
   */
  async freeResources(resources) {
    if (this.reuse) {
      this.pages.push(resources.page);
    } else {
      await resources.page.close();
    }
  }
}
module.exports = { ScreenCaster, NewWindowPage };

7495
yarn.lock

File diff suppressed because it is too large Load diff