Support site-specific wait via browsertrix-behaviors (#555)

The 0.6.0 release of Browsertrix Behaviors
(webrecorder/browsertrix-behaviors#70) lets site-specific behaviors implement an `awaitPageLoad()` function, which allows waiting for specific resources when the page loads.
- This PR just adds a call to this function directly after page load.
- Factors the post-load wait out into an `awaitPageLoad()` method used in both crawler and replaycrawler, to support the same wait in QA Mode
- This is to support a custom load wait time for Instagram (and other sites in the future)
This commit is contained in:
Ilya Kreymer 2024-04-18 17:16:57 -07:00 committed by GitHub
parent 75b617dc94
commit 51d82598e7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 32 additions and 24 deletions

View file

@ -19,7 +19,7 @@
"@novnc/novnc": "^1.4.0", "@novnc/novnc": "^1.4.0",
"@types/sax": "^1.2.7", "@types/sax": "^1.2.7",
"@webrecorder/wabac": "^2.16.12", "@webrecorder/wabac": "^2.16.12",
"browsertrix-behaviors": "^0.5.3", "browsertrix-behaviors": "^0.6.0",
"crc": "^4.3.2", "crc": "^4.3.2",
"get-folder-size": "^4.0.0", "get-folder-size": "^4.0.0",
"husky": "^8.0.3", "husky": "^8.0.3",

View file

@ -1835,12 +1835,7 @@ self.__bx_behaviors.selectMainBehavior();
await this.netIdle(page, logDetails); await this.netIdle(page, logDetails);
if (this.params.postLoadDelay) { await this.awaitPageLoad(page.mainFrame(), logDetails);
logger.info("Awaiting post load delay", {
seconds: this.params.postLoadDelay,
});
await sleep(this.params.postLoadDelay);
}
// skip extraction if at max depth // skip extraction if at max depth
if (seed.isAtMaxDepth(depth) || !selectorOptsList) { if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
@ -1871,6 +1866,26 @@ self.__bx_behaviors.selectMainBehavior();
} }
} }
/**
 * Runs the post-load wait for a frame: first the site-specific
 * `awaitPageLoad()` hook exposed by browsertrix-behaviors, then the
 * optional fixed `postLoadDelay`.
 *
 * Shared by the regular crawl path (called with the page's main frame)
 * and the replay/QA path (called with the replay frame) so both wait
 * the same way.
 *
 * @param frame - Frame in which the behaviors hook is evaluated.
 * @param logDetails - Structured context attached to the debug log entry.
 */
async awaitPageLoad(frame: Frame, logDetails: LogDetails) {
  logger.debug(
    "Waiting for custom page load via behavior",
    logDetails,
    "behavior",
  );
  try {
    // Guard the global: if the behaviors script was not injected into this
    // frame, the expression evaluates to a falsy value instead of throwing
    // a TypeError that would be reported as a failed wait on every page.
    // When the global exists, the promise returned by awaitPageLoad() is
    // the expression's value, so evaluate() still waits for it to settle.
    await frame.evaluate(
      "self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
    );
  } catch (e) {
    // Best-effort: a failing custom wait must not abort page processing.
    logger.warn("Waiting for custom page load failed", e, "behavior");
  }

  // Optional user-configured delay, applied after the custom wait
  // (logged as seconds).
  if (this.params.postLoadDelay) {
    logger.info("Awaiting post load delay", {
      seconds: this.params.postLoadDelay,
    });
    await sleep(this.params.postLoadDelay);
  }
}
async extractLinks( async extractLinks(
page: Page, page: Page,
data: PageState, data: PageState,

View file

@ -401,7 +401,9 @@ export class ReplayCrawler extends Crawler {
const timestamp = date.toISOString().slice(0, 19).replace(/[T:-]/g, ""); const timestamp = date.toISOString().slice(0, 19).replace(/[T:-]/g, "");
logger.info("Loading Replay", { url, timestamp }, "replay"); const logDetails = { url, timestamp, id: workerid, pageid };
logger.info("Loading Replay", logDetails, "replay");
const pageInfo = { const pageInfo = {
pageid, pageid,
@ -416,11 +418,7 @@ export class ReplayCrawler extends Crawler {
let replayFrame; let replayFrame;
if (page.frames().length <= SKIP_FRAMES) { if (page.frames().length <= SKIP_FRAMES) {
logger.warn( logger.warn("RWP possibly crashed, reloading page", logDetails, "replay");
"RWP possibly crashed, reloading page",
{ url, timestamp, id: workerid, pageid },
"replay",
);
//throw new Error("logged"); //throw new Error("logged");
replayFrame = await this.awaitRWPLoad(page); replayFrame = await this.awaitRWPLoad(page);
} else { } else {
@ -435,7 +433,7 @@ export class ReplayCrawler extends Crawler {
} catch (e) { } catch (e) {
logger.warn( logger.warn(
"Loading replay timed out", "Loading replay timed out",
{ url, timestamp, id: workerid, pageid, ...formatErr(e) }, { ...logDetails, ...formatErr(e) },
"replay", "replay",
); );
} }
@ -443,12 +441,7 @@ export class ReplayCrawler extends Crawler {
// optionally reload (todo: reevaluate if this is needed) // optionally reload (todo: reevaluate if this is needed)
// await page.reload(); // await page.reload();
if (this.params.postLoadDelay) { await this.awaitPageLoad(replayFrame, logDetails);
logger.info("Awaiting post load delay", {
seconds: this.params.postLoadDelay,
});
await sleep(this.params.postLoadDelay);
}
data.isHTMLPage = true; data.isHTMLPage = true;

View file

@ -1663,10 +1663,10 @@ browserslist@^4.22.2:
node-releases "^2.0.14" node-releases "^2.0.14"
update-browserslist-db "^1.0.13" update-browserslist-db "^1.0.13"
browsertrix-behaviors@^0.5.3: browsertrix-behaviors@^0.6.0:
version "0.5.3" version "0.6.0"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.3.tgz#f987075790b0fd970814f57195e8525277ddd2a0" resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.0.tgz#e16345e4b414b18e6441548d517d01b4316f744e"
integrity sha512-NiVdV42xvj4DvX/z0Dxqzqsa+5e57/M7hIyK3fl41BxzOJqCgSMu0MpkrWuKpbRVo+89ZnBmzh2z6D18Vmn1LA== integrity sha512-BdfEPHmDjhEIFrn80UKnwGT6HRgnmq2shNybu8BEfAHJQsqZdvP/VVKWvNGnWML1jjUKiwtvtkdFhtHedFQkzA==
bser@2.1.1: bser@2.1.1:
version "2.1.1" version "2.1.1"