Page-reuse concurrency + Browser Repair + Screencaster Cleanup Improvements (#157)

* new window: use cdp instead of window.open

* new window tweaks: add reuseCount, use browser.target() instead of opening a new blank page

* rename NewWindowPage -> ReuseWindowConcurrency, move to windowconcur.js
potential fix for #156

* browser repair:
- when using window-concurrency, attempt to repair / relaunch browser if cdp errors occur
- mark pages as failed and don't reuse if page error or cdp errors occur
- screencaster: clear previous targets if screencasting when repairing browser

* bump version to 0.7.0-beta.3
This commit is contained in:
Ilya Kreymer 2022-08-19 09:23:40 -07:00 committed by GitHub
parent 827c153679
commit 6cc38bf511
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 163 additions and 88 deletions

View file

@ -389,6 +389,7 @@ class Crawler {
} catch (e) {
console.warn(e);
await this.markPageFailed(page);
}
}
@ -519,6 +520,10 @@ class Crawler {
this.screencaster = this.initScreenCaster();
if (this.cluster.browser.setScreencaster) {
this.cluster.browser.setScreencaster(this.screencaster);
}
for (let i = 0; i < this.params.scopedSeeds.length; i++) {
const seed = this.params.scopedSeeds[i];
if (!await this.queueUrl(i, seed.url, 0, 0)) {
@ -700,6 +705,9 @@ class Crawler {
ignoreAbort = shouldIgnoreAbort(req);
});
// more serious page error, mark page session as invalid
page.on("error", () => this.markPageFailed(page));
const gotoOpts = isHTMLPage ? this.gotoOpts : "domcontentloaded";
try {
@ -711,7 +719,7 @@ class Crawler {
} catch (e) {
let msg = e.message || "";
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
this.statusLog(`ERROR: ${url}: ${msg}`);
this.statusLog(`Load Error: ${url}: ${msg}`);
this.errorCount++;
}
}
@ -739,6 +747,14 @@ class Crawler {
}
}
async markPageFailed(page) {
page.__failed = true;
this.errorCount++;
if (this.screencaster) {
await this.screencaster.endTarget(page.target());
}
}
async netIdle(page) {
if (!this.params.netIdleWait) {
return;

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.7.0-beta.2",
"version": "0.7.0-beta.3",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
@ -17,7 +17,7 @@
"minio": "7.0.26",
"node-fetch": "^2.6.1",
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
"puppeteer-core": "16.1.0",
"puppeteer-core": "^16.1.1",
"request": "^2.88.2",
"sitemapper": "^3.1.2",
"uuid": "8.3.2",

View file

@ -8,7 +8,7 @@ const { Cluster } = require("puppeteer-cluster");
const yargs = require("yargs/yargs");
const { hideBin } = require("yargs/helpers");
const { NewWindowPage} = require("./screencaster");
const { ReuseWindowConcurrency } = require("./windowconcur");
const { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } = require("./constants");
const { ScopedSeed } = require("./seeds");
const { interpolateFilename } = require("./storage");
@ -374,7 +374,7 @@ class ArgParser {
argv.newContext = Cluster.CONCURRENCY_PAGE;
if (argv.screencastPort && argv.workers > 1) {
console.log("Note: to support screencasting with >1 workers, newContext set to 'window' instead of 'page'");
argv.newContext = NewWindowPage;
argv.newContext = ReuseWindowConcurrency;
}
break;
@ -387,7 +387,7 @@ class ArgParser {
break;
case "window":
argv.newContext = NewWindowPage;
argv.newContext = ReuseWindowConcurrency;
break;
default:

View file

@ -6,9 +6,6 @@ const path = require("path");
const { initRedis } = require("./redis");
const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
const indexHTML = fs.readFileSync(path.join(__dirname, "..", "html", "screencast.html"), {encoding: "utf8"});
@ -231,27 +228,36 @@ class ScreenCaster
}
}
async endTarget(target) {
const id = target._targetId;
const cdp = this.targets.get(id);
if (!cdp) {
return;
async endAllTargets() {
const targetIds = this.targets.keys();
for (const key of targetIds) {
await this.endTargetById(key);
}
}
await this.stopCast(cdp);
async endTarget(target) {
await this.endTargetById(target._targetId);
}
async endTargetById(id) {
this.caches.delete(id);
this.urls.delete(id);
const cdp = this.targets.get(id);
if (cdp) {
try {
await this.stopCast(cdp);
await cdp.detach();
} catch (e) {
// already detached
}
}
await this.transport.sendAll({msg: "close", id});
this.targets.delete(id);
try {
await cdp.detach();
} catch (e) {
// already detached
}
}
async startCast(cdp) {
@ -298,66 +304,4 @@ class ScreenCaster
}
}
// ===========================================================================
/**
 * puppeteer-cluster concurrency implementation that gives each worker its own
 * browser window, opened via `window.open()` from a single "main" page, and
 * keeps freed pages in a pool for reuse.
 */
class NewWindowPage extends SingleBrowserImplementation {
/**
 * Runs the base-class init, opens the main page on about:blank and starts
 * listening for targets opened by that page.
 */
async init() {
await super.init();
// targets opened from the main page that have not yet been handed out
this.newTargets = [];
// arm the first "next target" promise before any window can be opened
this.nextPromise();
this.mainPage = await this.browser.newPage();
// pool of freed pages available for reuse
this.pages = [];
// when true, freed pages are pooled instead of closed (see freeResources)
this.reuse = true;
await this.mainPage.goto("about:blank");
this.mainTarget = this.mainPage.target();
this.browser.on("targetcreated", (target) => {
// only accept targets whose opener is the main page
if (this._nextTarget && target.opener() === this.mainTarget) {
this.newTargets.push(target);
// resolve the promise the current getNewPage() call is waiting on ...
this._nextTarget();
// ... and immediately arm a fresh one for the next window
this.nextPromise();
}
});
}
/**
 * Creates a new pending promise and stores its resolver in `_nextTarget`;
 * the `targetcreated` listener calls that resolver.
 */
nextPromise() {
this._nextPromise = new Promise((resolve) => this._nextTarget = resolve);
}
/**
 * Opens a new window from the main page and resolves with its page.
 * @returns {Promise<{page: object}>}
 */
async getNewPage() {
// capture the current promise BEFORE opening the window: the listener
// replaces this._nextPromise as soon as the target appears
const p = this._nextPromise;
await this.mainPage.evaluate("window.open('about:blank', '', 'resizable');");
await p;
const target = this.newTargets.shift();
return {page: await target.page() };
}
/**
 * Returns a pooled page if one is available, otherwise opens a new window.
 * @returns {Promise<{page: object}>}
 */
async createResources() {
if (this.pages.length) {
return {page: this.pages.shift()};
}
return await this.getNewPage();
}
/**
 * Returns the page to the pool when reuse is enabled, otherwise closes it.
 * @param {{page: object}} resources - object previously returned by createResources()
 */
async freeResources(resources) {
if (this.reuse) {
this.pages.push(resources.page);
} else {
await resources.page.close();
}
}
}
module.exports = { ScreenCaster, NewWindowPage, WSTransport, RedisPubSubTransport };
module.exports = { ScreenCaster, WSTransport, RedisPubSubTransport };

115
util/windowconcur.js Normal file
View file

@ -0,0 +1,115 @@
const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
// ===========================================================================
/**
 * puppeteer-cluster concurrency implementation that runs every worker in its
 * own browser window (created over CDP with `Target.createTarget`) and reuses
 * each window for a bounded number of jobs.
 *
 * Pages are handed out as `{page, count, id}` records: `count` is the number
 * of jobs the page has served and `id` is the CDP session id of the browser
 * instance that created it, so pages from a previous (repaired) browser are
 * never reused.
 *
 * NOTE(review): `openInstances`, `repairing`, `repairRequested` and
 * `waitingForRepairResolvers` are not defined here; they are presumably
 * inherited from SingleBrowserImplementation -- confirm against the base class.
 */
class ReuseWindowConcurrency extends SingleBrowserImplementation {
  /**
   * Runs the base-class init and (re)creates all per-browser state.
   * Called once at startup and again from repair() after a relaunch.
   */
  async init() {
    await super.init();

    // targets created with our marker URL, keyed by target id, not yet handed out
    this.pendingTargets = new Map();
    // unique marker URL so the "targetcreated" listener only picks up our own windows
    this.startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);

    // pool of {page, count, id} records available for reuse
    this.pages = [];
    // maximum number of jobs a single page may serve before it is closed
    this.reuseCount = 25;

    // init() runs again on every repair(): keep a screencaster that was already
    // installed via setScreencaster(), otherwise it would be lost after the
    // first repair and later repairs could no longer clear its targets
    this.screencaster = this.screencaster || null;

    const mainTarget = this.browser.target();
    this.cdp = await mainTarget.createCDPSession();
    this.sessionId = this.cdp.id();

    this.browser.on("targetcreated", (target) => {
      if (target.url() === this.startPage) {
        this.pendingTargets.set(target._targetId, target);
      }
    });
  }

  /**
   * Registers the screencaster whose targets are cleared when the browser
   * is repaired.
   * @param {object} screencaster - object exposing `endAllTargets()`
   */
  setScreencaster(screencaster) {
    this.screencaster = screencaster;
  }

  /**
   * Closes (best-effort) and relaunches the browser. If a repair is already
   * running, or pages are still open, waits until the repair has finished
   * instead of starting another one.
   */
  async repair() {
    if (this.openInstances !== 0 || this.repairing) {
      // already repairing or there are still pages open? wait for start/finish
      await new Promise(resolve => this.waitingForRepairResolvers.push(resolve));
      return;
    }

    this.repairing = true;
    console.debug("Starting repair");

    if (this.screencaster) {
      // deliberately not awaited so a broken browser cannot stall the repair,
      // but a rejection must not be left unhandled
      this.screencaster.endAllTargets().catch((e) => console.warn(e));
    }

    try {
      // will probably fail, but just in case the repair was not necessary
      await this.browser.close();
    } catch (e) {
      console.debug("Unable to close browser.");
    }

    try {
      await this.init();
    } catch (err) {
      console.debug("Unable to restart chrome.");
    }

    this.repairRequested = false;
    this.repairing = false;
    this.waitingForRepairResolvers.forEach(resolve => resolve());
    this.waitingForRepairResolvers = [];
  }

  /**
   * Opens a new browser window over CDP and resolves with a fresh page record.
   * Retries until a window has been created; a CDP error triggers a repair
   * before the next attempt.
   * @returns {Promise<{page: object, count: number, id: *}>}
   */
  async getNewPage() {
    while (true) {
      let targetId;
      try {
        const res = await this.cdp.send("Target.createTarget", {url: this.startPage, newWindow: true});
        targetId = res.targetId;
      } catch (e) {
        console.warn(e);
        await this.repair();
        // no target was created in this attempt, start over
        continue;
      }

      const target = this.pendingTargets.get(targetId);
      // this shouldn't really happen, but just in case somehow ended up w/o a target, try again
      if (!target) {
        continue;
      }

      this.pendingTargets.delete(targetId);
      return {page: await target.page(), count: 0, id: this.sessionId};
    }
  }

  /**
   * Returns a pooled page record if one from the current browser session is
   * available, otherwise opens a new window.
   * @returns {Promise<{page: object, count: number, id: *}>}
   */
  async createResources() {
    if (this.pages.length) {
      const res = this.pages.shift();
      if (res.id === this.sessionId) {
        return res;
      }
      // page is using stale session (eg. from crashed/previous browser instance), don't attempt to reuse
    }
    return await this.getNewPage();
  }

  /**
   * Takes a page record back after a job. The page is closed, and NOT pooled,
   * when it was marked as failed or has served more than `reuseCount` jobs;
   * otherwise it is returned to the pool.
   * @param {{page: object, count: number, id: *}} resources - record from createResources()
   */
  async freeResources(resources) {
    resources.count += 1;

    // a failed page must never go back into the pool, and is closed exactly once
    if (resources.page.__failed || resources.count > this.reuseCount) {
      await resources.page.close();
      return;
    }

    this.pages.push(resources);
  }
}
module.exports = { ReuseWindowConcurrency };

View file

@ -4246,10 +4246,10 @@ punycode@^2.1.0, punycode@^2.1.1:
dependencies:
debug "^4.1.1"
puppeteer-core@^16.1.0:
version "16.1.0"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.0.tgz#0485312363e6e1d65889d4b31de677bd36f872e4"
integrity sha512-Eu9FCqdWU2PU/RY53sa+JTsbFiQg5fJyaHX5DP0WZ4+lVLVdMfR9dwPimRkSl9NEcArm7lZMpiDlVCYelE90ZA==
puppeteer-core@^16.1.1:
version "16.1.1"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.1.tgz#2c26c560934a1c524a767c9ec0818520b7adb22a"
integrity sha512-ls+A6t+cbeNtsNIEyWkGoVJRHseEvBhS3NlI2DBFaJNBUG6kUfmAVyColu1ubgy4VuWLKpGUcwrPTVIvNd1Dew==
dependencies:
cross-fetch "3.1.5"
debug "4.3.4"