behavior logging tweaks, add netIdle (#381)

* behavior logging tweaks, add netIdle
* fix shouldIncludeFrame() check: was actually erroring out and never accepting any iframes!
now used not only for link extraction but also to run() behaviors
* add logging if iframe check fails
* Dockerfile: add commented out line to use local behaviors.js
* bump behaviors to 0.5.2
This commit is contained in:
Ilya Kreymer 2023-09-14 19:48:41 -07:00 committed by GitHub
parent d72443ced3
commit 3c9be514d3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 45 additions and 18 deletions

View file

@ -1 +1,2 @@
.* .*
behaviors.js

View file

@ -47,6 +47,9 @@ RUN ln -s /app/main.js /usr/bin/crawl; ln -s /app/create-login-profile.js /usr/b
WORKDIR /crawls WORKDIR /crawls
# enable to test custom behaviors build (from browsertrix-behaviors)
# COPY behaviors.js /app/node_modules/browsertrix-behaviors/dist/behaviors.js
ADD docker-entrypoint.sh /docker-entrypoint.sh ADD docker-entrypoint.sh /docker-entrypoint.sh
ENTRYPOINT ["/docker-entrypoint.sh"] ENTRYPOINT ["/docker-entrypoint.sh"]

View file

@ -516,8 +516,9 @@ self.__bx_behaviors.selectMainBehavior();
"behavior" "behavior"
); );
if (res && res.length) { await this.netIdle(page, logDetails);
logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior");
if (res) {
data.loadState = LoadState.BEHAVIORS_DONE; data.loadState = LoadState.BEHAVIORS_DONE;
} }
} }
@ -580,13 +581,22 @@ self.__bx_behaviors.selectMainBehavior();
logger.info("Running behaviors", {frames: frames.length, frameUrls: frames.map(frame => frame.url()), ...logDetails}, "behavior"); logger.info("Running behaviors", {frames: frames.length, frameUrls: frames.map(frame => frame.url()), ...logDetails}, "behavior");
return await Promise.allSettled( const results = await Promise.allSettled(
frames.map(frame => this.browser.evaluateWithCLI(page, frame, cdp, "self.__bx_behaviors.run();", logDetails, "behavior")) frames.map(frame => this.browser.evaluateWithCLI(page, frame, cdp, "self.__bx_behaviors.run();", logDetails, "behavior"))
); );
// Promise.allSettled() resolves to an array of {status, value | reason} records.
// Iterate the records themselves with for...of: for...in would yield the array's
// index strings ("0", "1", ...), so the destructured status/reason would always
// be undefined and rejected frame runs would never be reported.
for (const {status, reason} of results) {
  if (status === "rejected") {
    logger.warn("Behavior run partially failed", {reason, ...logDetails}, "behavior");
  }
}
logger.info("Behaviors finished", {finished: results.length, ...logDetails}, "behavior");
return true;
} catch (e) { } catch (e) {
logger.warn("Behavior run failed", {...errJSON(e), ...logDetails}, "behavior"); logger.warn("Behavior run failed", {...errJSON(e), ...logDetails}, "behavior");
return null; return false;
} }
} }
@ -597,11 +607,11 @@ self.__bx_behaviors.selectMainBehavior();
const frameUrl = frame.url(); const frameUrl = frame.url();
const frameElem = await frame.frameElement(); // this is all designed to detect and skip PDFs, and other frames that are actually EMBEDs
// if there's no tag or an iframe tag, then assume it's a regular frame
const tagName = await frame.evaluate("self && self.frameElement && self.frameElement.tagName");
const tagName = await frame.evaluate(e => e.tagName, frameElem); if (tagName && tagName !== "IFRAME" && tagName !== "FRAME") {
if (tagName !== "IFRAME" && tagName !== "FRAME") {
logger.debug("Skipping processing non-frame object", {tagName, frameUrl, ...logDetails}, "behavior"); logger.debug("Skipping processing non-frame object", {tagName, frameUrl, ...logDetails}, "behavior");
return null; return null;
} }
@ -1162,7 +1172,13 @@ self.__bx_behaviors.selectMainBehavior();
let frames = await page.frames(); let frames = await page.frames();
frames = await Promise.allSettled(frames.map((frame) => this.shouldIncludeFrame(frame, logDetails))); frames = await Promise.allSettled(frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)));
data.filteredFrames = frames.filter((x) => x.status === "fulfilled" && x.value).map(x => x.value); data.filteredFrames = frames.filter((x) => {
if (x.status === "fulfilled" && x.value) {
return true;
}
logger.warn("Error in iframe check", {reason: x.reason, ...logDetails});
return false;
}).map(x => x.value);
//data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails)); //data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails));
} else { } else {

View file

@ -13,7 +13,7 @@
}, },
"dependencies": { "dependencies": {
"@novnc/novnc": "^1.4.0", "@novnc/novnc": "^1.4.0",
"browsertrix-behaviors": "^0.5.1", "browsertrix-behaviors": "^0.5.2",
"get-folder-size": "^4.0.0", "get-folder-size": "^4.0.0",
"husky": "^8.0.3", "husky": "^8.0.3",
"ioredis": "^4.27.1", "ioredis": "^4.27.1",

View file

@ -1147,10 +1147,10 @@ browserslist@^4.21.3:
node-releases "^2.0.6" node-releases "^2.0.6"
update-browserslist-db "^1.0.9" update-browserslist-db "^1.0.9"
browsertrix-behaviors@^0.5.1: browsertrix-behaviors@^0.5.2:
version "0.5.1" version "0.5.2"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.1.tgz#c4756b349dcabd23e25f851cec804d92e94eb63b" resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.2.tgz#d2fe1d6ff08815ff0dd68a05fe1a3cdc4bbec8ca"
integrity sha512-cNSSpQyQT73Y5NcBn2PFDkZM2ptxHVVcqxstryvtzZNOW9gGqzJlLPo8tmCBY00JHrMyn5rm8qImbFglcG/DKg== integrity sha512-8nhpnzY8OM1mxQ+mZ+m10dpGgMuhCnKUV5YUlitDpMyEfKlEybUmTz5sroVQH8e//NcJox7W6QYjaU2Y/ygxww==
bser@2.1.1: bser@2.1.1:
version "2.1.1" version "2.1.1"
@ -2442,6 +2442,11 @@ is-glob@^4.0.0, is-glob@^4.0.1:
dependencies: dependencies:
is-extglob "^2.1.1" is-extglob "^2.1.1"
is-gzip@2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/is-gzip/-/is-gzip-2.0.0.tgz#f4fed2bbd9f96bf2cb39e19262797fdb15aad933"
integrity sha512-jtO4Njg6q58zDo/Pu4027beSZ0VdsZlt8/5Moco6yAg+DIxb5BK/xUYqYG2+MD4+piKldXJNHxRkhEYI2fvrxA==
is-negative-zero@^2.0.1: is-negative-zero@^2.0.1:
version "2.0.1" version "2.0.1"
resolved "https://registry.yarnpkg.com/is-negative-zero/-/is-negative-zero-2.0.1.tgz#3de746c18dda2319241a53675908d8f766f11c24" resolved "https://registry.yarnpkg.com/is-negative-zero/-/is-negative-zero-2.0.1.tgz#3de746c18dda2319241a53675908d8f766f11c24"
@ -3990,12 +3995,14 @@ sisteransi@^1.0.5:
resolved "https://registry.yarnpkg.com/sisteransi/-/sisteransi-1.0.5.tgz#134d681297756437cc05ca01370d3a7a571075ed" resolved "https://registry.yarnpkg.com/sisteransi/-/sisteransi-1.0.5.tgz#134d681297756437cc05ca01370d3a7a571075ed"
integrity sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg== integrity sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==
sitemapper@^3.1.2: sitemapper@^3.2.5:
version "3.1.12" version "3.2.6"
resolved "https://registry.yarnpkg.com/sitemapper/-/sitemapper-3.1.12.tgz#58f6cb22112da4d73498c3d14c926c3a1c0d74ee" resolved "https://registry.yarnpkg.com/sitemapper/-/sitemapper-3.2.6.tgz#892ebdade9a1b0839bd3dee3b67f3d57b10b3a89"
integrity sha512-0BOXAhIfjQll1rrUkkFkpAhYy7MTs887H7Zpc4eAxLPPJJRFXNDdrryTadFWubGMn62bqGr3KdKBKhVdtt/HWg== integrity sha512-AZbim4lmKgchUj6yyJ9ru0eLJ4/S6QAqy5QEbpCpvBbBnXxTERLMC6rzgKy1gHM19YUEtYJFTC2t8lxDWO0wkQ==
dependencies: dependencies:
got "^11.8.0" got "^11.8.0"
is-gzip "2.0.0"
p-limit "^3.1.0"
xml2js "^0.4.23" xml2js "^0.4.23"
slash@^3.0.0: slash@^3.0.0: