mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Page-reuse concurrency + Browser Repair + Screencaster Cleanup Improvements (#157)
* new window: use cdp instead of window.open
* new window tweaks: add reuseCount, use browser.target() instead of opening a new blank page
* rename NewWindowPage -> ReuseWindowConcurrency, move to windowconcur.js
  potential fix for #156
* browser repair:
  - when using window-concurrency, attempt to repair / relaunch browser if cdp errors occur
  - mark pages as failed and don't reuse if page error or cdp errors occur
  - screencaster: clear previous targets if screencasting when repairing browser
* bump version to 0.7.0-beta.3
This commit is contained in:
parent
827c153679
commit
6cc38bf511
6 changed files with 163 additions and 88 deletions
18
crawler.js
18
crawler.js
|
@ -389,6 +389,7 @@ class Crawler {
|
|||
|
||||
} catch (e) {
|
||||
console.warn(e);
|
||||
await this.markPageFailed(page);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -519,6 +520,10 @@ class Crawler {
|
|||
|
||||
this.screencaster = this.initScreenCaster();
|
||||
|
||||
if (this.cluster.browser.setScreencaster) {
|
||||
this.cluster.browser.setScreencaster(this.screencaster);
|
||||
}
|
||||
|
||||
for (let i = 0; i < this.params.scopedSeeds.length; i++) {
|
||||
const seed = this.params.scopedSeeds[i];
|
||||
if (!await this.queueUrl(i, seed.url, 0, 0)) {
|
||||
|
@ -700,6 +705,9 @@ class Crawler {
|
|||
ignoreAbort = shouldIgnoreAbort(req);
|
||||
});
|
||||
|
||||
// more serious page error, mark page session as invalid
|
||||
page.on("error", () => this.markPageFailed(page));
|
||||
|
||||
const gotoOpts = isHTMLPage ? this.gotoOpts : "domcontentloaded";
|
||||
|
||||
try {
|
||||
|
@ -711,7 +719,7 @@ class Crawler {
|
|||
} catch (e) {
|
||||
let msg = e.message || "";
|
||||
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
|
||||
this.statusLog(`ERROR: ${url}: ${msg}`);
|
||||
this.statusLog(`Load Error: ${url}: ${msg}`);
|
||||
this.errorCount++;
|
||||
}
|
||||
}
|
||||
|
@ -739,6 +747,14 @@ class Crawler {
|
|||
}
|
||||
}
|
||||
|
||||
async markPageFailed(page) {
|
||||
page.__failed = true;
|
||||
this.errorCount++;
|
||||
if (this.screencaster) {
|
||||
await this.screencaster.endTarget(page.target());
|
||||
}
|
||||
}
|
||||
|
||||
async netIdle(page) {
|
||||
if (!this.params.netIdleWait) {
|
||||
return;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "0.7.0-beta.2",
|
||||
"version": "0.7.0-beta.3",
|
||||
"main": "browsertrix-crawler",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||
|
@ -17,7 +17,7 @@
|
|||
"minio": "7.0.26",
|
||||
"node-fetch": "^2.6.1",
|
||||
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
|
||||
"puppeteer-core": "16.1.0",
|
||||
"puppeteer-core": "^16.1.1",
|
||||
"request": "^2.88.2",
|
||||
"sitemapper": "^3.1.2",
|
||||
"uuid": "8.3.2",
|
||||
|
|
|
@ -8,7 +8,7 @@ const { Cluster } = require("puppeteer-cluster");
|
|||
const yargs = require("yargs/yargs");
|
||||
const { hideBin } = require("yargs/helpers");
|
||||
|
||||
const { NewWindowPage} = require("./screencaster");
|
||||
const { ReuseWindowConcurrency } = require("./windowconcur");
|
||||
const { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } = require("./constants");
|
||||
const { ScopedSeed } = require("./seeds");
|
||||
const { interpolateFilename } = require("./storage");
|
||||
|
@ -374,7 +374,7 @@ class ArgParser {
|
|||
argv.newContext = Cluster.CONCURRENCY_PAGE;
|
||||
if (argv.screencastPort && argv.workers > 1) {
|
||||
console.log("Note: to support screencasting with >1 workers, newContext set to 'window' instead of 'page'");
|
||||
argv.newContext = NewWindowPage;
|
||||
argv.newContext = ReuseWindowConcurrency;
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -387,7 +387,7 @@ class ArgParser {
|
|||
break;
|
||||
|
||||
case "window":
|
||||
argv.newContext = NewWindowPage;
|
||||
argv.newContext = ReuseWindowConcurrency;
|
||||
break;
|
||||
|
||||
default:
|
||||
|
|
|
@ -6,9 +6,6 @@ const path = require("path");
|
|||
|
||||
const { initRedis } = require("./redis");
|
||||
|
||||
|
||||
const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
|
||||
|
||||
const indexHTML = fs.readFileSync(path.join(__dirname, "..", "html", "screencast.html"), {encoding: "utf8"});
|
||||
|
||||
|
||||
|
@ -231,29 +228,38 @@ class ScreenCaster
|
|||
}
|
||||
}
|
||||
|
||||
async endTarget(target) {
|
||||
const id = target._targetId;
|
||||
const cdp = this.targets.get(id);
|
||||
if (!cdp) {
|
||||
return;
|
||||
async endAllTargets() {
|
||||
const targetIds = this.targets.keys();
|
||||
|
||||
for (const key of targetIds) {
|
||||
await this.endTargetById(key);
|
||||
}
|
||||
}
|
||||
|
||||
await this.stopCast(cdp);
|
||||
async endTarget(target) {
|
||||
await this.endTargetById(target._targetId);
|
||||
}
|
||||
|
||||
async endTargetById(id) {
|
||||
this.caches.delete(id);
|
||||
this.urls.delete(id);
|
||||
|
||||
await this.transport.sendAll({msg: "close", id});
|
||||
|
||||
this.targets.delete(id);
|
||||
const cdp = this.targets.get(id);
|
||||
|
||||
if (cdp) {
|
||||
try {
|
||||
await this.stopCast(cdp);
|
||||
await cdp.detach();
|
||||
} catch (e) {
|
||||
// already detached
|
||||
}
|
||||
}
|
||||
|
||||
await this.transport.sendAll({msg: "close", id});
|
||||
|
||||
this.targets.delete(id);
|
||||
}
|
||||
|
||||
async startCast(cdp) {
|
||||
if (cdp._startedCast) {
|
||||
return;
|
||||
|
@ -298,66 +304,4 @@ class ScreenCaster
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
/**
 * Concurrency implementation that gives every worker its own window.
 *
 * New windows are opened by calling window.open() from a dedicated
 * "main" page; the resulting targets are picked up through the browser's
 * "targetcreated" event. Released pages are kept in a pool and handed
 * out again while `reuse` is enabled.
 */
class NewWindowPage extends SingleBrowserImplementation {
  /**
   * Launch the browser via the parent class, open the main page used to
   * spawn windows, and start listening for the targets it opens.
   */
  async init() {
    await super.init();

    // targets opened from the main page, waiting to be claimed by getNewPage()
    this.newTargets = [];

    this.nextPromise();

    this.mainPage = await this.browser.newPage();

    // pool of released pages
    this.pages = [];
    this.reuse = true;

    await this.mainPage.goto("about:blank");

    this.mainTarget = this.mainPage.target();

    this.browser.on("targetcreated", (created) => {
      // only react to windows opened from our own main page
      if (!this._nextTarget || created.opener() !== this.mainTarget) {
        return;
      }

      this.newTargets.push(created);
      this._nextTarget();
      this.nextPromise();
    });
  }

  /**
   * Arm a fresh promise that is resolved when the next window target
   * opened from the main page has been recorded.
   */
  nextPromise() {
    this._nextPromise = new Promise((done) => {
      this._nextTarget = done;
    });
  }

  /**
   * Open a new window from the main page and wait for its target.
   *
   * @returns {Promise<{page: object}>} resources wrapping the new page
   */
  async getNewPage() {
    // capture the promise first: it is replaced once the target arrives
    const targetArrived = this._nextPromise;

    await this.mainPage.evaluate("window.open('about:blank', '', 'resizable');");

    await targetArrived;

    const opened = this.newTargets.shift();
    const page = await opened.page();

    return {page};
  }

  /**
   * Hand out a pooled page if one is available, otherwise open a new window.
   *
   * @returns {Promise<{page: object}>}
   */
  async createResources() {
    if (!this.pages.length) {
      return await this.getNewPage();
    }

    const page = this.pages.shift();
    return {page};
  }

  /**
   * Return a page to the pool, or close it when reuse is disabled.
   *
   * @param {{page: object}} resources - resources previously returned by createResources()
   */
  async freeResources(resources) {
    if (!this.reuse) {
      await resources.page.close();
      return;
    }

    this.pages.push(resources.page);
  }
}
|
||||
|
||||
|
||||
|
||||
module.exports = { ScreenCaster, NewWindowPage, WSTransport, RedisPubSubTransport };
|
||||
module.exports = { ScreenCaster, WSTransport, RedisPubSubTransport };
|
||||
|
|
115
util/windowconcur.js
Normal file
115
util/windowconcur.js
Normal file
|
@ -0,0 +1,115 @@
|
|||
const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
/**
 * Concurrency implementation that runs each worker page in its own browser
 * window and keeps released pages in a pool for reuse.
 *
 * Windows are opened with the CDP "Target.createTarget" command, sent over a
 * session attached to the browser target. Every pooled resource records the
 * id of the CDP session it was created under, so pages belonging to an
 * earlier browser instance are never handed out after a repair.
 */
class ReuseWindowConcurrency extends SingleBrowserImplementation {
  /**
   * Launch the browser via the parent class and (re)initialize all
   * per-browser state. Also called from repair() to relaunch the browser.
   */
  async init() {
    await super.init();

    // targets opened by getNewPage() that have not been claimed yet, keyed by target id
    this.pendingTargets = new Map();

    // random start url, used to recognize our own targets in "targetcreated"
    this.startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);

    // pool of released resources: {page, count, id}
    this.pages = [];

    // a page is closed instead of pooled once it has been released more than this many times
    this.reuseCount = 25;

    // init() runs again on every repair: keep a screencaster that was already
    // registered, otherwise later repairs could no longer clear its targets
    this.screencaster = this.screencaster || null;

    const mainTarget = this.browser.target();

    this.cdp = await mainTarget.createCDPSession();
    this.sessionId = this.cdp.id();

    this.browser.on("targetcreated", (target) => {
      if (target.url() === this.startPage) {
        this.pendingTargets.set(target._targetId, target);
      }
    });
  }

  /**
   * Register the screencaster whose targets should be cleared when the
   * browser is repaired.
   *
   * @param {object} screencaster - object exposing an async endAllTargets()
   */
  setScreencaster(screencaster) {
    this.screencaster = screencaster;
  }

  /**
   * Close and relaunch the browser. If pages are still open or a repair is
   * already running, wait until a repair has completed instead.
   *
   * NOTE(review): when called from getNewPage() while other instances are
   * open, this waits for some other caller to run the repair -- confirm
   * that one is always triggered.
   */
  async repair() {
    if (this.openInstances !== 0 || this.repairing) {
      // already repairing or there are still pages open? wait for start/finish
      await new Promise(resolve => this.waitingForRepairResolvers.push(resolve));
      return;
    }

    this.repairing = true;
    console.debug("Starting repair");

    if (this.screencaster) {
      // not awaited, so the relaunch is not held up by screencast teardown;
      // the rejection handler keeps a failed teardown from going unhandled
      this.screencaster.endAllTargets().catch((e) => console.warn(e));
    }

    try {
      // will probably fail, but just in case the repair was not necessary
      await this.browser.close();
    } catch (e) {
      console.debug("Unable to close browser.");
    }

    try {
      await this.init();
    } catch (err) {
      console.debug("Unable to restart chrome.");
    }

    this.repairRequested = false;
    this.repairing = false;
    this.waitingForRepairResolvers.forEach(resolve => resolve());
    this.waitingForRepairResolvers = [];
  }

  /**
   * Open a new window on the start page and wrap it as a pool resource.
   * A failing CDP command triggers a repair, after which creation is retried.
   *
   * @returns {Promise<{page: object, count: number, id: *}>}
   */
  async getNewPage() {
    while (true) {
      let targetId;

      try {
        const res = await this.cdp.send("Target.createTarget", {url: this.startPage, newWindow: true});
        targetId = res.targetId;
      } catch (e) {
        console.warn(e);
        await this.repair();
        // no target was created, try again
        continue;
      }

      const target = this.pendingTargets.get(targetId);

      // this shouldn't really happen, but just in case somehow ended up w/o a target, try again
      if (!target) {
        continue;
      }

      this.pendingTargets.delete(targetId);

      return {page: await target.page(), count: 0, id: this.sessionId};
    }
  }

  /**
   * Hand out a pooled page created under the current CDP session, or open
   * a new window if the pool holds none.
   *
   * @returns {Promise<{page: object, count: number, id: *}>}
   */
  async createResources() {
    while (this.pages.length) {
      const res = this.pages.shift();

      if (res.id === this.sessionId) {
        return res;
      }

      // page is using stale session (eg. from crashed/previous browser instance),
      // don't attempt to reuse; keep looking through the rest of the pool
    }

    return await this.getNewPage();
  }

  /**
   * Release a page: close it if it was marked as failed or has exceeded the
   * reuse limit, otherwise return it to the pool.
   *
   * @param {{page: object, count: number, id: *}} resources
   */
  async freeResources(resources) {
    // if marked as failed, close it and never put it back into the pool
    if (resources.page.__failed) {
      await resources.page.close();
      return;
    }

    if (++resources.count > this.reuseCount) {
      await resources.page.close();
    } else {
      this.pages.push(resources);
    }
  }
}
|
||||
|
||||
module.exports = { ReuseWindowConcurrency };
|
||||
|
||||
|
|
@ -4246,10 +4246,10 @@ punycode@^2.1.0, punycode@^2.1.1:
|
|||
dependencies:
|
||||
debug "^4.1.1"
|
||||
|
||||
puppeteer-core@^16.1.0:
|
||||
version "16.1.0"
|
||||
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.0.tgz#0485312363e6e1d65889d4b31de677bd36f872e4"
|
||||
integrity sha512-Eu9FCqdWU2PU/RY53sa+JTsbFiQg5fJyaHX5DP0WZ4+lVLVdMfR9dwPimRkSl9NEcArm7lZMpiDlVCYelE90ZA==
|
||||
puppeteer-core@^16.1.1:
|
||||
version "16.1.1"
|
||||
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.1.tgz#2c26c560934a1c524a767c9ec0818520b7adb22a"
|
||||
integrity sha512-ls+A6t+cbeNtsNIEyWkGoVJRHseEvBhS3NlI2DBFaJNBUG6kUfmAVyColu1ubgy4VuWLKpGUcwrPTVIvNd1Dew==
|
||||
dependencies:
|
||||
cross-fetch "3.1.5"
|
||||
debug "4.3.4"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue