mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Support site-specific wait via browsertrix-behaviors (#555)
The 0.6.0 release of Browsertrix Behaviors (webrecorder/browsertrix-behaviors#70) lets site-specific behaviors implement an `awaitPageLoad()` function that waits for specific resources during page load. - This PR adds a call to that function directly after page load. - The call is factored out, together with the post-load delay, into an `awaitPageLoad()` method used by both crawler and replaycrawler, so that QA Mode performs the same wait. - This supports a custom loading wait time for Instagram (and other sites in the future).
This commit is contained in:
parent
75b617dc94
commit
51d82598e7
4 changed files with 32 additions and 24 deletions
|
@ -19,7 +19,7 @@
|
|||
"@novnc/novnc": "^1.4.0",
|
||||
"@types/sax": "^1.2.7",
|
||||
"@webrecorder/wabac": "^2.16.12",
|
||||
"browsertrix-behaviors": "^0.5.3",
|
||||
"browsertrix-behaviors": "^0.6.0",
|
||||
"crc": "^4.3.2",
|
||||
"get-folder-size": "^4.0.0",
|
||||
"husky": "^8.0.3",
|
||||
|
|
|
@ -1835,12 +1835,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
await this.netIdle(page, logDetails);
|
||||
|
||||
if (this.params.postLoadDelay) {
|
||||
logger.info("Awaiting post load delay", {
|
||||
seconds: this.params.postLoadDelay,
|
||||
});
|
||||
await sleep(this.params.postLoadDelay);
|
||||
}
|
||||
await this.awaitPageLoad(page.mainFrame(), logDetails);
|
||||
|
||||
// skip extraction if at max depth
|
||||
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
||||
|
@ -1871,6 +1866,26 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Runs the optional site-specific page-load wait exposed by the injected
 * behaviors script (`self.__bx_behaviors.awaitPageLoad()`) inside `frame`,
 * then applies the configured post-load delay, if any.
 *
 * A failure of the in-page wait is logged as a warning and never rethrown,
 * so the post-load delay still runs afterwards.
 *
 * @param frame - Frame in which the behaviors expression is evaluated.
 * @param logDetails - Context attached to the debug log entry.
 */
async awaitPageLoad(frame: Frame, logDetails: LogDetails): Promise<void> {
  logger.debug(
    "Waiting for custom page load via behavior",
    logDetails,
    "behavior",
  );

  try {
    // Guard the lookup: if the behaviors script is not present in this frame,
    // the unguarded call would throw a TypeError in the page and produce a
    // spurious warning for every page instead of simply skipping the wait.
    await frame.evaluate(
      "self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
    );
  } catch (e) {
    logger.warn("Waiting for custom page load failed", e, "behavior");
  }

  // Optional fixed delay after load; the value is logged as seconds and
  // passed unchanged to sleep() (presumably seconds as well — confirm
  // against the sleep helper).
  if (this.params.postLoadDelay) {
    logger.info("Awaiting post load delay", {
      seconds: this.params.postLoadDelay,
    });
    await sleep(this.params.postLoadDelay);
  }
}
|
||||
|
||||
async extractLinks(
|
||||
page: Page,
|
||||
data: PageState,
|
||||
|
|
|
@ -401,7 +401,9 @@ export class ReplayCrawler extends Crawler {
|
|||
|
||||
const timestamp = date.toISOString().slice(0, 19).replace(/[T:-]/g, "");
|
||||
|
||||
logger.info("Loading Replay", { url, timestamp }, "replay");
|
||||
const logDetails = { url, timestamp, id: workerid, pageid };
|
||||
|
||||
logger.info("Loading Replay", logDetails, "replay");
|
||||
|
||||
const pageInfo = {
|
||||
pageid,
|
||||
|
@ -416,11 +418,7 @@ export class ReplayCrawler extends Crawler {
|
|||
let replayFrame;
|
||||
|
||||
if (page.frames().length <= SKIP_FRAMES) {
|
||||
logger.warn(
|
||||
"RWP possibly crashed, reloading page",
|
||||
{ url, timestamp, id: workerid, pageid },
|
||||
"replay",
|
||||
);
|
||||
logger.warn("RWP possibly crashed, reloading page", logDetails, "replay");
|
||||
//throw new Error("logged");
|
||||
replayFrame = await this.awaitRWPLoad(page);
|
||||
} else {
|
||||
|
@ -435,7 +433,7 @@ export class ReplayCrawler extends Crawler {
|
|||
} catch (e) {
|
||||
logger.warn(
|
||||
"Loading replay timed out",
|
||||
{ url, timestamp, id: workerid, pageid, ...formatErr(e) },
|
||||
{ ...logDetails, ...formatErr(e) },
|
||||
"replay",
|
||||
);
|
||||
}
|
||||
|
@ -443,12 +441,7 @@ export class ReplayCrawler extends Crawler {
|
|||
// optionally reload (todo: reevaluate if this is needed)
|
||||
// await page.reload();
|
||||
|
||||
if (this.params.postLoadDelay) {
|
||||
logger.info("Awaiting post load delay", {
|
||||
seconds: this.params.postLoadDelay,
|
||||
});
|
||||
await sleep(this.params.postLoadDelay);
|
||||
}
|
||||
await this.awaitPageLoad(replayFrame, logDetails);
|
||||
|
||||
data.isHTMLPage = true;
|
||||
|
||||
|
|
|
@ -1663,10 +1663,10 @@ browserslist@^4.22.2:
|
|||
node-releases "^2.0.14"
|
||||
update-browserslist-db "^1.0.13"
|
||||
|
||||
browsertrix-behaviors@^0.5.3:
|
||||
version "0.5.3"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.3.tgz#f987075790b0fd970814f57195e8525277ddd2a0"
|
||||
integrity sha512-NiVdV42xvj4DvX/z0Dxqzqsa+5e57/M7hIyK3fl41BxzOJqCgSMu0MpkrWuKpbRVo+89ZnBmzh2z6D18Vmn1LA==
|
||||
browsertrix-behaviors@^0.6.0:
|
||||
version "0.6.0"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.0.tgz#e16345e4b414b18e6441548d517d01b4316f744e"
|
||||
integrity sha512-BdfEPHmDjhEIFrn80UKnwGT6HRgnmq2shNybu8BEfAHJQsqZdvP/VVKWvNGnWML1jjUKiwtvtkdFhtHedFQkzA==
|
||||
|
||||
bser@2.1.1:
|
||||
version "2.1.1"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue