mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
behavior logging tweaks, add netIdle (#381)
* behavior logging tweaks, add netIdle
* fix shouldIncludeFrame() check: it was actually erroring out and never accepting any iframes! It is now used not only for link extraction but also to run() behaviors
* add logging if the iframe check fails
* Dockerfile: add commented-out line to use local behaviors.js
* bump behaviors to 0.5.2
This commit is contained in:
parent
d72443ced3
commit
3c9be514d3
5 changed files with 45 additions and 18 deletions
|
@ -1 +1,2 @@
|
|||
.*
|
||||
behaviors.js
|
||||
|
|
|
@ -47,6 +47,9 @@ RUN ln -s /app/main.js /usr/bin/crawl; ln -s /app/create-login-profile.js /usr/b
|
|||
|
||||
WORKDIR /crawls
|
||||
|
||||
# enable to test custom behaviors build (from browsertrix-behaviors)
|
||||
# COPY behaviors.js /app/node_modules/browsertrix-behaviors/dist/behaviors.js
|
||||
|
||||
ADD docker-entrypoint.sh /docker-entrypoint.sh
|
||||
ENTRYPOINT ["/docker-entrypoint.sh"]
|
||||
|
||||
|
|
34
crawler.js
34
crawler.js
|
@ -516,8 +516,9 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
"behavior"
|
||||
);
|
||||
|
||||
if (res && res.length) {
|
||||
logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior");
|
||||
await this.netIdle(page, logDetails);
|
||||
|
||||
if (res) {
|
||||
data.loadState = LoadState.BEHAVIORS_DONE;
|
||||
}
|
||||
}
|
||||
|
@ -580,13 +581,22 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
logger.info("Running behaviors", {frames: frames.length, frameUrls: frames.map(frame => frame.url()), ...logDetails}, "behavior");
|
||||
|
||||
return await Promise.allSettled(
|
||||
const results = await Promise.allSettled(
|
||||
frames.map(frame => this.browser.evaluateWithCLI(page, frame, cdp, "self.__bx_behaviors.run();", logDetails, "behavior"))
|
||||
);
|
||||
|
||||
for (const {status, reason} in results) {
|
||||
if (status === "rejected") {
|
||||
logger.warn("Behavior run partially failed", {reason, ...logDetails}, "behavior");
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Behaviors finished", {finished: results.length, ...logDetails}, "behavior");
|
||||
return true;
|
||||
|
||||
} catch (e) {
|
||||
logger.warn("Behavior run failed", {...errJSON(e), ...logDetails}, "behavior");
|
||||
return null;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -597,11 +607,11 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
const frameUrl = frame.url();
|
||||
|
||||
const frameElem = await frame.frameElement();
|
||||
// this is all designed to detect and skip PDFs, and other frames that are actually EMBEDs
|
||||
// if there's no tag or an iframe tag, then assume its a regular frame
|
||||
const tagName = await frame.evaluate("self && self.frameElement && self.frameElement.tagName");
|
||||
|
||||
const tagName = await frame.evaluate(e => e.tagName, frameElem);
|
||||
|
||||
if (tagName !== "IFRAME" && tagName !== "FRAME") {
|
||||
if (tagName && tagName !== "IFRAME" && tagName !== "FRAME") {
|
||||
logger.debug("Skipping processing non-frame object", {tagName, frameUrl, ...logDetails}, "behavior");
|
||||
return null;
|
||||
}
|
||||
|
@ -1162,7 +1172,13 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
let frames = await page.frames();
|
||||
frames = await Promise.allSettled(frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)));
|
||||
|
||||
data.filteredFrames = frames.filter((x) => x.status === "fulfilled" && x.value).map(x => x.value);
|
||||
data.filteredFrames = frames.filter((x) => {
|
||||
if (x.status === "fulfilled" && x.value) {
|
||||
return true;
|
||||
}
|
||||
logger.warn("Error in iframe check", {reason: x.reason, ...logDetails});
|
||||
return false;
|
||||
}).map(x => x.value);
|
||||
|
||||
//data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails));
|
||||
} else {
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
},
|
||||
"dependencies": {
|
||||
"@novnc/novnc": "^1.4.0",
|
||||
"browsertrix-behaviors": "^0.5.1",
|
||||
"browsertrix-behaviors": "^0.5.2",
|
||||
"get-folder-size": "^4.0.0",
|
||||
"husky": "^8.0.3",
|
||||
"ioredis": "^4.27.1",
|
||||
|
|
23
yarn.lock
23
yarn.lock
|
@ -1147,10 +1147,10 @@ browserslist@^4.21.3:
|
|||
node-releases "^2.0.6"
|
||||
update-browserslist-db "^1.0.9"
|
||||
|
||||
browsertrix-behaviors@^0.5.1:
|
||||
version "0.5.1"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.1.tgz#c4756b349dcabd23e25f851cec804d92e94eb63b"
|
||||
integrity sha512-cNSSpQyQT73Y5NcBn2PFDkZM2ptxHVVcqxstryvtzZNOW9gGqzJlLPo8tmCBY00JHrMyn5rm8qImbFglcG/DKg==
|
||||
browsertrix-behaviors@^0.5.2:
|
||||
version "0.5.2"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.2.tgz#d2fe1d6ff08815ff0dd68a05fe1a3cdc4bbec8ca"
|
||||
integrity sha512-8nhpnzY8OM1mxQ+mZ+m10dpGgMuhCnKUV5YUlitDpMyEfKlEybUmTz5sroVQH8e//NcJox7W6QYjaU2Y/ygxww==
|
||||
|
||||
bser@2.1.1:
|
||||
version "2.1.1"
|
||||
|
@ -2442,6 +2442,11 @@ is-glob@^4.0.0, is-glob@^4.0.1:
|
|||
dependencies:
|
||||
is-extglob "^2.1.1"
|
||||
|
||||
is-gzip@2.0.0:
|
||||
version "2.0.0"
|
||||
resolved "https://registry.yarnpkg.com/is-gzip/-/is-gzip-2.0.0.tgz#f4fed2bbd9f96bf2cb39e19262797fdb15aad933"
|
||||
integrity sha512-jtO4Njg6q58zDo/Pu4027beSZ0VdsZlt8/5Moco6yAg+DIxb5BK/xUYqYG2+MD4+piKldXJNHxRkhEYI2fvrxA==
|
||||
|
||||
is-negative-zero@^2.0.1:
|
||||
version "2.0.1"
|
||||
resolved "https://registry.yarnpkg.com/is-negative-zero/-/is-negative-zero-2.0.1.tgz#3de746c18dda2319241a53675908d8f766f11c24"
|
||||
|
@ -3990,12 +3995,14 @@ sisteransi@^1.0.5:
|
|||
resolved "https://registry.yarnpkg.com/sisteransi/-/sisteransi-1.0.5.tgz#134d681297756437cc05ca01370d3a7a571075ed"
|
||||
integrity sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==
|
||||
|
||||
sitemapper@^3.1.2:
|
||||
version "3.1.12"
|
||||
resolved "https://registry.yarnpkg.com/sitemapper/-/sitemapper-3.1.12.tgz#58f6cb22112da4d73498c3d14c926c3a1c0d74ee"
|
||||
integrity sha512-0BOXAhIfjQll1rrUkkFkpAhYy7MTs887H7Zpc4eAxLPPJJRFXNDdrryTadFWubGMn62bqGr3KdKBKhVdtt/HWg==
|
||||
sitemapper@^3.2.5:
|
||||
version "3.2.6"
|
||||
resolved "https://registry.yarnpkg.com/sitemapper/-/sitemapper-3.2.6.tgz#892ebdade9a1b0839bd3dee3b67f3d57b10b3a89"
|
||||
integrity sha512-AZbim4lmKgchUj6yyJ9ru0eLJ4/S6QAqy5QEbpCpvBbBnXxTERLMC6rzgKy1gHM19YUEtYJFTC2t8lxDWO0wkQ==
|
||||
dependencies:
|
||||
got "^11.8.0"
|
||||
is-gzip "2.0.0"
|
||||
p-limit "^3.1.0"
|
||||
xml2js "^0.4.23"
|
||||
|
||||
slash@^3.0.0:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue