diff --git a/package.json b/package.json
index 219a21f4..bd61be3d 100644
--- a/package.json
+++ b/package.json
@@ -19,7 +19,7 @@
     "@novnc/novnc": "^1.4.0",
     "@types/sax": "^1.2.7",
     "@webrecorder/wabac": "^2.16.12",
-    "browsertrix-behaviors": "^0.5.3",
+    "browsertrix-behaviors": "^0.6.0",
     "crc": "^4.3.2",
     "get-folder-size": "^4.0.0",
     "husky": "^8.0.3",
diff --git a/src/crawler.ts b/src/crawler.ts
index 8bcdc2af..27f28324 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -1835,12 +1835,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     await this.netIdle(page, logDetails);
 
-    if (this.params.postLoadDelay) {
-      logger.info("Awaiting post load delay", {
-        seconds: this.params.postLoadDelay,
-      });
-      await sleep(this.params.postLoadDelay);
-    }
+    await this.awaitPageLoad(page.mainFrame(), logDetails);
 
     // skip extraction if at max depth
     if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
@@ -1871,6 +1866,26 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
+  async awaitPageLoad(frame: Frame, logDetails: LogDetails) {
+    logger.debug(
+      "Waiting for custom page load via behavior",
+      logDetails,
+      "behavior",
+    );
+    try {
+      await frame.evaluate("self.__bx_behaviors.awaitPageLoad();");
+    } catch (e) {
+      logger.warn("Waiting for custom page load failed", e, "behavior");
+    }
+
+    if (this.params.postLoadDelay) {
+      logger.info("Awaiting post load delay", {
+        seconds: this.params.postLoadDelay,
+      });
+      await sleep(this.params.postLoadDelay);
+    }
+  }
+
   async extractLinks(
     page: Page,
     data: PageState,
diff --git a/src/replaycrawler.ts b/src/replaycrawler.ts
index 82d5fda5..ab5b18fc 100644
--- a/src/replaycrawler.ts
+++ b/src/replaycrawler.ts
@@ -401,7 +401,9 @@ export class ReplayCrawler extends Crawler {
 
     const timestamp = date.toISOString().slice(0, 19).replace(/[T:-]/g, "");
 
-    logger.info("Loading Replay", { url, timestamp }, "replay");
+    const logDetails = { url, timestamp, id: workerid, pageid };
+
+    logger.info("Loading Replay", logDetails, "replay");
 
     const pageInfo = {
       pageid,
@@ -416,11 +418,7 @@ export class ReplayCrawler extends Crawler {
     let replayFrame;
 
     if (page.frames().length <= SKIP_FRAMES) {
-      logger.warn(
-        "RWP possibly crashed, reloading page",
-        { url, timestamp, id: workerid, pageid },
-        "replay",
-      );
+      logger.warn("RWP possibly crashed, reloading page", logDetails, "replay");
       //throw new Error("logged");
       replayFrame = await this.awaitRWPLoad(page);
     } else {
@@ -435,7 +433,7 @@ export class ReplayCrawler extends Crawler {
     } catch (e) {
       logger.warn(
         "Loading replay timed out",
-        { url, timestamp, id: workerid, pageid, ...formatErr(e) },
+        { ...logDetails, ...formatErr(e) },
         "replay",
       );
     }
@@ -443,12 +441,7 @@ export class ReplayCrawler extends Crawler {
 
     // optionally reload (todo: reevaluate if this is needed)
     // await page.reload();
 
-    if (this.params.postLoadDelay) {
-      logger.info("Awaiting post load delay", {
-        seconds: this.params.postLoadDelay,
-      });
-      await sleep(this.params.postLoadDelay);
-    }
+    await this.awaitPageLoad(replayFrame, logDetails);
 
     data.isHTMLPage = true;
diff --git a/yarn.lock b/yarn.lock
index 276e0893..07506669 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1663,10 +1663,10 @@ browserslist@^4.22.2:
     node-releases "^2.0.14"
     update-browserslist-db "^1.0.13"
 
-browsertrix-behaviors@^0.5.3:
-  version "0.5.3"
-  resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.3.tgz#f987075790b0fd970814f57195e8525277ddd2a0"
-  integrity sha512-NiVdV42xvj4DvX/z0Dxqzqsa+5e57/M7hIyK3fl41BxzOJqCgSMu0MpkrWuKpbRVo+89ZnBmzh2z6D18Vmn1LA==
+browsertrix-behaviors@^0.6.0:
+  version "0.6.0"
+  resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.0.tgz#e16345e4b414b18e6441548d517d01b4316f744e"
+  integrity sha512-BdfEPHmDjhEIFrn80UKnwGT6HRgnmq2shNybu8BEfAHJQsqZdvP/VVKWvNGnWML1jjUKiwtvtkdFhtHedFQkzA==
 
 bser@2.1.1:
   version "2.1.1"