From 3c9be514d3b22bee2cf8800beba10d1762d3aafe Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 14 Sep 2023 19:48:41 -0700 Subject: [PATCH] behavior logging tweaks, add netIdle (#381) * behavior logging tweaks, add netIdle * fix shouldIncludeFrame() check: was actually erroring out and never accepting any iframes! now used not only for link extraction but also to run() behaviors * add logging if iframe check fails * Dockerfile: add commented out line to use local behaviors.js * bump behaviors to 0.5.2 --- .eslintignore | 1 + Dockerfile | 3 +++ crawler.js | 34 +++++++++++++++++++++++++--------- package.json | 2 +- yarn.lock | 23 +++++++++++++++-------- 5 files changed, 45 insertions(+), 18 deletions(-) diff --git a/.eslintignore b/.eslintignore index 8d98f9de..9b020f3b 100644 --- a/.eslintignore +++ b/.eslintignore @@ -1 +1,2 @@ .* +behaviors.js diff --git a/Dockerfile b/Dockerfile index f5ef85b0..8abc7c53 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,6 +47,9 @@ RUN ln -s /app/main.js /usr/bin/crawl; ln -s /app/create-login-profile.js /usr/b WORKDIR /crawls +# enable to test custom behaviors build (from browsertrix-behaviors) +# COPY behaviors.js /app/node_modules/browsertrix-behaviors/dist/behaviors.js + ADD docker-entrypoint.sh /docker-entrypoint.sh ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/crawler.js b/crawler.js index 2c43a050..8957f851 100644 --- a/crawler.js +++ b/crawler.js @@ -516,8 +516,9 @@ self.__bx_behaviors.selectMainBehavior(); "behavior" ); - if (res && res.length) { - logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior"); + await this.netIdle(page, logDetails); + + if (res) { data.loadState = LoadState.BEHAVIORS_DONE; } } @@ -580,13 +581,22 @@ self.__bx_behaviors.selectMainBehavior(); logger.info("Running behaviors", {frames: frames.length, frameUrls: frames.map(frame => frame.url()), ...logDetails}, "behavior"); - return await Promise.allSettled( + const results = await Promise.allSettled( 
frames.map(frame => this.browser.evaluateWithCLI(page, frame, cdp, "self.__bx_behaviors.run();", logDetails, "behavior")) ); + for (const {status, reason} of results) { + if (status === "rejected") { + logger.warn("Behavior run partially failed", {reason, ...logDetails}, "behavior"); + } + } + + logger.info("Behaviors finished", {finished: results.length, ...logDetails}, "behavior"); + return true; + } catch (e) { logger.warn("Behavior run failed", {...errJSON(e), ...logDetails}, "behavior"); - return null; + return false; } } @@ -597,11 +607,11 @@ self.__bx_behaviors.selectMainBehavior(); const frameUrl = frame.url(); - const frameElem = await frame.frameElement(); + // this is all designed to detect and skip PDFs, and other frames that are actually EMBEDs + // if there's no tag or an iframe tag, then assume it's a regular frame + const tagName = await frame.evaluate("self && self.frameElement && self.frameElement.tagName"); - const tagName = await frame.evaluate(e => e.tagName, frameElem); - - if (tagName !== "IFRAME" && tagName !== "FRAME") { + if (tagName && tagName !== "IFRAME" && tagName !== "FRAME") { logger.debug("Skipping processing non-frame object", {tagName, frameUrl, ...logDetails}, "behavior"); return null; } @@ -1162,7 +1172,13 @@ self.__bx_behaviors.selectMainBehavior(); let frames = await page.frames(); frames = await Promise.allSettled(frames.map((frame) => this.shouldIncludeFrame(frame, logDetails))); - data.filteredFrames = frames.filter((x) => x.status === "fulfilled" && x.value).map(x => x.value); + data.filteredFrames = frames.filter((x) => { + if (x.status === "fulfilled") { + return !!x.value; + } + logger.warn("Error in iframe check", {reason: x.reason, ...logDetails}); + return false; + }).map(x => x.value); //data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails)); } else { diff --git a/package.json b/package.json index 1bd75943..6d8d4f52 100644 --- a/package.json +++ b/package.json @@ 
-13,7 +13,7 @@ }, "dependencies": { "@novnc/novnc": "^1.4.0", - "browsertrix-behaviors": "^0.5.1", + "browsertrix-behaviors": "^0.5.2", "get-folder-size": "^4.0.0", "husky": "^8.0.3", "ioredis": "^4.27.1", diff --git a/yarn.lock b/yarn.lock index 7e79d4c6..ec86c3a0 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1147,10 +1147,10 @@ browserslist@^4.21.3: node-releases "^2.0.6" update-browserslist-db "^1.0.9" -browsertrix-behaviors@^0.5.1: - version "0.5.1" - resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.1.tgz#c4756b349dcabd23e25f851cec804d92e94eb63b" - integrity sha512-cNSSpQyQT73Y5NcBn2PFDkZM2ptxHVVcqxstryvtzZNOW9gGqzJlLPo8tmCBY00JHrMyn5rm8qImbFglcG/DKg== +browsertrix-behaviors@^0.5.2: + version "0.5.2" + resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.2.tgz#d2fe1d6ff08815ff0dd68a05fe1a3cdc4bbec8ca" + integrity sha512-8nhpnzY8OM1mxQ+mZ+m10dpGgMuhCnKUV5YUlitDpMyEfKlEybUmTz5sroVQH8e//NcJox7W6QYjaU2Y/ygxww== bser@2.1.1: version "2.1.1" @@ -2442,6 +2442,11 @@ is-glob@^4.0.0, is-glob@^4.0.1: dependencies: is-extglob "^2.1.1" +is-gzip@2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/is-gzip/-/is-gzip-2.0.0.tgz#f4fed2bbd9f96bf2cb39e19262797fdb15aad933" + integrity sha512-jtO4Njg6q58zDo/Pu4027beSZ0VdsZlt8/5Moco6yAg+DIxb5BK/xUYqYG2+MD4+piKldXJNHxRkhEYI2fvrxA== + is-negative-zero@^2.0.1: version "2.0.1" resolved "https://registry.yarnpkg.com/is-negative-zero/-/is-negative-zero-2.0.1.tgz#3de746c18dda2319241a53675908d8f766f11c24" @@ -3990,12 +3995,14 @@ sisteransi@^1.0.5: resolved "https://registry.yarnpkg.com/sisteransi/-/sisteransi-1.0.5.tgz#134d681297756437cc05ca01370d3a7a571075ed" integrity sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg== -sitemapper@^3.1.2: - version "3.1.12" - resolved "https://registry.yarnpkg.com/sitemapper/-/sitemapper-3.1.12.tgz#58f6cb22112da4d73498c3d14c926c3a1c0d74ee" - integrity 
sha512-0BOXAhIfjQll1rrUkkFkpAhYy7MTs887H7Zpc4eAxLPPJJRFXNDdrryTadFWubGMn62bqGr3KdKBKhVdtt/HWg== +sitemapper@^3.2.5: + version "3.2.6" + resolved "https://registry.yarnpkg.com/sitemapper/-/sitemapper-3.2.6.tgz#892ebdade9a1b0839bd3dee3b67f3d57b10b3a89" + integrity sha512-AZbim4lmKgchUj6yyJ9ru0eLJ4/S6QAqy5QEbpCpvBbBnXxTERLMC6rzgKy1gHM19YUEtYJFTC2t8lxDWO0wkQ== dependencies: got "^11.8.0" + is-gzip "2.0.0" + p-limit "^3.1.0" xml2js "^0.4.23" slash@^3.0.0: