2022-10-24 15:30:10 +02:00
|
|
|
import ws from "ws";
|
|
|
|
import http from "http";
|
|
|
|
import url from "url";
|
|
|
|
import fs from "fs";
|
2021-06-07 17:43:36 -07:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
import { initRedis } from "./redis.js";
|
2022-02-23 12:09:48 -08:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
// Viewer page returned for "/" by WSTransport.handleRequest(); it is read
// synchronously, once, when this module is loaded.
const indexHTML = fs.readFileSync(
  new URL("../html/screencast.html", import.meta.url),
  "utf8"
);
|
2021-06-07 17:43:36 -07:00
|
|
|
|
|
|
|
|
|
|
|
// ===========================================================================
|
2022-02-23 12:09:48 -08:00
|
|
|
/**
 * Screencast transport that serves the viewer page over HTTP and pushes
 * packets to every client connected on the "/ws" WebSocket endpoint.
 *
 * `caster` starts out as null; ScreenCaster's constructor assigns itself to
 * it. The HTTP server is already listening by then, so every use of `caster`
 * in this class tolerates it still being null.
 */
class WSTransport {
  /**
   * Create the WebSocket and HTTP servers and start listening immediately.
   *
   * @param {number} port - passed straight to `httpServer.listen()`.
   */
  constructor(port) {
    this.allWS = new Set();
    this.caster = null;

    // noServer: the HTTP server below owns the listening socket and upgrade
    // requests are handed over manually in handleUpgrade().
    this.wss = new ws.Server({ noServer: true });
    this.wss.on("connection", (socket) => this.initWebSocket(socket));

    this.httpServer = http.createServer((...args) => this.handleRequest(...args));
    this.httpServer.on("upgrade", (request, socket, head) => this.handleUpgrade(request, socket, head));

    this.httpServer.listen(port);
  }

  /**
   * Handle an HTTP "upgrade" request: only "/ws" becomes a WebSocket.
   *
   * Once an "upgrade" listener is registered the handler owns the raw
   * socket, so requests for any other path are destroyed explicitly instead
   * of being left open.
   *
   * @param {object} request - incoming HTTP request (`request.url` is read).
   * @param {object} socket - raw network socket of the request.
   * @param {Buffer} head - first packet of the upgraded stream.
   */
  handleUpgrade(request, socket, head) {
    const pathname = url.parse(request.url).pathname;

    if (pathname !== "/ws") {
      socket.destroy();
      return;
    }

    this.wss.handleUpgrade(request, socket, head, (client) => {
      this.wss.emit("connection", client, request);
    });
  }

  /**
   * Plain HTTP handler: "/" returns the viewer page, anything else a 404.
   *
   * @param {object} req - incoming request (`req.url` is read).
   * @param {object} res - response; `writeHead()` and `end()` are called.
   */
  async handleRequest(req, res) {
    const pathname = url.parse(req.url).pathname;

    if (pathname === "/") {
      res.writeHead(200, {"Content-Type": "text/html"});
      res.end(indexHTML);
      return;
    }

    res.writeHead(404, {"Content-Type": "text/html"});
    res.end("Not Found");
  }

  /**
   * Register a newly connected client.
   *
   * The client first receives everything the caster has cached, then joins
   * the broadcast set. Casting is started when the first client arrives and
   * stopped again when the last one closes.
   *
   * @param {object} socket - connected WebSocket (`send()` and `on()` are used).
   */
  initWebSocket(socket) {
    // A client may connect before a ScreenCaster has attached itself.
    if (this.caster) {
      for (const packet of this.caster.iterCachedData()) {
        socket.send(JSON.stringify(packet));
      }
    }

    this.allWS.add(socket);

    if (this.allWS.size === 1 && this.caster) {
      this.caster.startCastAll();
    }

    socket.on("close", () => {
      this.allWS.delete(socket);

      if (this.allWS.size === 0 && this.caster) {
        this.caster.stopCastAll();
      }
    });
  }

  /**
   * Serialize `packet` once and send it to every connected client.
   *
   * @param {object} packet - JSON-serializable message.
   */
  sendAll(packet) {
    const payload = JSON.stringify(packet);

    for (const socket of this.allWS) {
      socket.send(payload);
    }
  }

  /**
   * @returns {number} number of connected clients (0, i.e. falsy, when idle).
   */
  isActive() {
    return this.allWS.size;
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ===========================================================================
|
|
|
|
/**
 * Screencast transport backed by redis pub/sub.
 *
 * Packets are published on `c:<crawlId>:cast`. Viewers announce themselves
 * by publishing "connect" / "disconnect" on `c:<crawlId>:ctrl`.
 *
 * `caster` starts out as null; ScreenCaster's constructor assigns itself to it.
 */
class RedisPubSubTransport {
  /**
   * @param {string} redisUrl - handed to `initRedis()` for both connections.
   * @param {string} crawlId - used to build the cast / ctrl channel names.
   */
  constructor(redisUrl, crawlId) {
    this.numConnections = 0;
    this.caster = null;
    this.castChannel = `c:${crawlId}:cast`;
    this.ctrlChannel = `c:${crawlId}:ctrl`;

    // Constructors cannot await, so keep the setup promise: sendAll() and
    // isActive() wait on it instead of touching `this.redis` before it exists.
    this.ready = this.init(redisUrl);
    // Attach a handler so a failed setup is reported rather than becoming an
    // unhandled rejection; awaiting `ready` still rethrows the same error.
    this.ready.catch((e) => console.error("Screencast redis init failed", e));
  }

  /**
   * Open the publishing connection and a second connection subscribed to
   * the control channel.
   *
   * @param {string} redisUrl - handed to `initRedis()`.
   */
  async init(redisUrl) {
    this.redis = await initRedis(redisUrl);

    const subRedis = await initRedis(redisUrl);

    // Listen before subscribing so no control message can slip through
    // between the subscription being confirmed and the listener existing.
    subRedis.on("message", async (channel, message) => {
      if (channel !== this.ctrlChannel) {
        return;
      }

      await this.handleCtrlMessage(message);
    });

    await subRedis.subscribe(this.ctrlChannel);
  }

  /**
   * Apply one control-channel message.
   *
   * "connect": the first viewer starts casting; every later viewer triggers
   * a re-publish of the cached packets. "disconnect": casting stops once the
   * viewer count is back at zero. Other messages are ignored.
   *
   * @param {string} message - payload received on the control channel.
   */
  async handleCtrlMessage(message) {
    switch (message) {
    case "connect":
      this.numConnections++;
      if (!this.caster) {
        break;
      }
      if (this.numConnections === 1) {
        this.caster.startCastAll();
      } else {
        for (const packet of this.caster.iterCachedData()) {
          await this.sendAll(packet);
        }
      }
      break;

    case "disconnect":
      // Clamp at zero: a "disconnect" without a matching "connect" would
      // otherwise push the count negative, and the next "connect" would then
      // never be seen as the first viewer.
      this.numConnections = Math.max(0, this.numConnections - 1);
      if (this.numConnections === 0 && this.caster) {
        this.caster.stopCastAll();
      }
      break;
    }
  }

  /**
   * Publish `packet`, JSON-encoded, on the cast channel.
   *
   * @param {object} packet - JSON-serializable message.
   */
  async sendAll(packet) {
    await this.ready;
    await this.redis.publish(this.castChannel, JSON.stringify(packet));
  }

  /**
   * @returns {Promise<boolean>} true when PUBSUB NUMSUB reports at least one
   *   subscriber on the cast channel.
   */
  async isActive() {
    await this.ready;
    const result = await this.redis.pubsub("numsub", this.castChannel);
    return (result.length > 1 ? result[1] > 0 : false);
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ===========================================================================
|
|
|
|
/**
 * Streams CDP screencast frames from browser targets to viewers through a
 * transport (any object providing `sendAll(packet)` and `isActive()`).
 *
 * Per target id it keeps the CDP session, the most recent frame and the most
 * recent URL, so that iterCachedData() can replay the current state.
 */
class ScreenCaster {
  /**
   * @param {object} transport - transport to publish through; its `caster`
   *   property is set to this instance.
   * @param {number} numWorkers - reported as `browsers` in the init message.
   */
  constructor(transport, numWorkers) {
    this.transport = transport;
    this.transport.caster = this;

    this.caches = new Map();   // target id -> latest frame data
    this.urls = new Map();     // target id -> latest known url

    this.targets = new Map();  // target id -> CDP session

    // todo: make customizable
    this.maxWidth = 640;
    this.maxHeight = 480;

    this.initMsg = {
      msg: "init",
      width: this.maxWidth,
      height: this.maxHeight,
      browsers: numWorkers
    };
  }

  /**
   * Yield the init message followed by one "screencast" packet per target
   * that currently has a cached frame.
   */
  *iterCachedData() {
    yield this.initMsg;

    const msg = "screencast";
    for (const [id, data] of this.caches) {
      const url = this.urls.get(id);
      yield {msg, id, url, data};
    }
  }

  /**
   * Hook "targetdestroyed" on the target's browser context, once per
   * context, so that destroyed targets are cleaned up via endTarget().
   *
   * @param {object} target - browser target (`browserContext()` is called).
   */
  detectClose(target) {
    const context = target.browserContext();

    // marker property set below: the listener is already installed
    if (context.__destroy_added) {
      return;
    }

    context.on("targetdestroyed", (destroyed) => {
      // Event listeners cannot await; report a failed cleanup instead of
      // leaving the rejection unhandled.
      this.endTarget(destroyed).catch((e) => console.warn("Screencast target cleanup failed", e));
    });

    context.__destroy_added = true;
  }

  /**
   * Begin tracking `target`: record its URL and, the first time the target
   * is seen, open a CDP session and forward its screencast frames.
   *
   * @param {object} target - browser target to cast.
   * @param {string} currUrl - URL to record for the target right away.
   */
  async screencastTarget(target, currUrl) {
    const id = target._targetId;

    this.urls.set(id, currUrl);

    // already attached: only the URL needed refreshing
    if (this.targets.has(id)) {
      return;
    }

    this.detectClose(target);

    const cdp = await target.createCDPSession();

    this.targets.set(id, cdp);

    const msg = "screencast";

    cdp.on("Page.screencastFrame", async (resp) => {
      const data = resp.data;
      const sessionId = resp.sessionId;
      const frameUrl = target.url();

      this.caches.set(id, data);
      this.urls.set(id, frameUrl);

      // Delivery is best-effort: a transport failure must neither skip the
      // ack below nor escape this listener as an unhandled rejection.
      try {
        await this.transport.sendAll({msg, id, data, url: frameUrl});
      } catch (e) {
        console.warn("Screencast frame delivery failed", e);
      }

      try {
        await cdp.send("Page.screencastFrameAck", {sessionId});
      } catch (e) {
        // ack failed, probably window/tab already closed
      }
    });

    // only start capturing when someone is watching
    if (await this.transport.isActive()) {
      await this.startCast(cdp);
    }
  }

  /**
   * End every tracked target, one after another.
   */
  async endAllTargets() {
    // snapshot: endTargetById() removes entries from the map as it goes
    const targetIds = [...this.targets.keys()];

    for (const id of targetIds) {
      await this.endTargetById(id);
    }
  }

  /**
   * @param {object} target - browser target; ended by its `_targetId`.
   */
  async endTarget(target) {
    await this.endTargetById(target._targetId);
  }

  /**
   * Forget the target's cached state, stop and detach its CDP session (if
   * there is one) and publish a "close" packet for it.
   *
   * @param {string} id - target id.
   */
  async endTargetById(id) {
    this.caches.delete(id);
    this.urls.delete(id);

    const cdp = this.targets.get(id);

    // Unregister before the first await so the session is dropped even when
    // a later step throws, and so startCastAll()/stopCastAll() running in the
    // meantime no longer see a session that is being torn down.
    this.targets.delete(id);

    if (cdp) {
      try {
        await this.stopCast(cdp);
        await cdp.detach();
      } catch (e) {
        // already detached
      }
    }

    await this.transport.sendAll({msg: "close", id});
  }

  /**
   * Start the screencast on a CDP session unless it is already running.
   *
   * @param {object} cdp - CDP session; `_startedCast` is used as the flag.
   */
  async startCast(cdp) {
    if (cdp._startedCast) {
      return;
    }

    // set before awaiting so overlapping calls do not start it twice
    cdp._startedCast = true;

    try {
      await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 2, maxWidth: this.maxWidth, maxHeight: this.maxHeight});
    } catch (e) {
      // not actually started: clear the flag so a later call can retry
      cdp._startedCast = false;
      throw e;
    }
  }

  /**
   * Stop the screencast on a CDP session if it was started.
   *
   * @param {object} cdp - CDP session; `_startedCast` is used as the flag.
   */
  async stopCast(cdp) {
    if (!cdp._startedCast) {
      return;
    }

    cdp._startedCast = false;

    try {
      await cdp.send("Page.stopScreencast");
    } catch (e) {
      // likely already stopped
    }
  }

  /**
   * Start casting on every tracked session.
   *
   * @returns {Promise<Array>} settles once every start attempt has settled.
   */
  startCastAll() {
    const promises = [];

    for (const cdp of this.targets.values()) {
      promises.push(this.startCast(cdp));
    }

    return Promise.allSettled(promises);
  }

  /**
   * Stop casting on every tracked session.
   *
   * @returns {Promise<Array>} settles once every stop attempt has settled.
   */
  stopCastAll() {
    const promises = [];

    for (const cdp of this.targets.values()) {
      promises.push(this.stopCast(cdp));
    }

    return Promise.allSettled(promises);
  }
}
|
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
// Public API of this module: the caster and the two transports defined above.
export { ScreenCaster, WSTransport, RedisPubSubTransport };
|