From 549d65517332fdb2f0f7a33e67e6a3dbb00b9de2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 8 Jul 2025 13:08:52 -0700 Subject: [PATCH] Support option to fail crawl on content check (#861) - add --failOnContentCheck for quick fail if content check in behavior fails - expose __bx_contentCheckFailed to cause an immediate failure from behavior - only allow failing crawl due to content check from within awaitPageLoad() callback - set a 'failReason' key to track that crawl failed due to a particular content check reason - deps: update to browsertrix-behaviors 0.9.0, update to wabac.js (2.23.6) - fixes #860 --------- Co-authored-by: Tessa Walsh --- package.json | 4 ++-- src/crawler.ts | 33 ++++++++++++++++++++++++++++++--- src/util/argParser.ts | 7 +++++++ src/util/constants.ts | 2 ++ src/util/reqresp.ts | 10 ++++++---- src/util/state.ts | 14 ++++++++++++++ tsconfig.json | 2 +- yarn.lock | 16 ++++++++-------- 8 files changed, 70 insertions(+), 18 deletions(-) diff --git a/package.json b/package.json index eee2db03..19c905fb 100644 --- a/package.json +++ b/package.json @@ -18,8 +18,8 @@ "dependencies": { "@novnc/novnc": "1.4.0", "@puppeteer/replay": "^3.1.1", - "@webrecorder/wabac": "^2.23.3", - "browsertrix-behaviors": "^0.8.5", + "@webrecorder/wabac": "^2.23.6", + "browsertrix-behaviors": "^0.9.0", "client-zip": "^2.4.5", "css-selector-parser": "^3.0.5", "fetch-socks": "^1.3.0", diff --git a/src/crawler.ts b/src/crawler.ts index 448f7f9b..8e9c0282 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -645,6 +645,11 @@ export class Crawler { } } } + if (await this.crawlState.isFailed()) { + logger.error("Crawl failed, no pages crawled successfully"); + status = "failed"; + exitCode = ExitCodes.Failed; + } } catch (e) { logger.error("Crawl failed", e); exitCode = ExitCodes.Failed; @@ -938,7 +943,24 @@ self.__bx_behaviors.selectMainBehavior(); return nextFlowStep(id, page, workerid); }); - // await page.exposeFunction("__bx_hasSet", (data: string) => 
this.crawlState.hasUserSet(data)); + if (this.params.failOnContentCheck) { + await page.exposeFunction( + BxFunctionBindings.ContentCheckFailed, + (reason: string) => { + // if called outside of awaitPageLoad(), ignore + if (!opts.data.contentCheckAllowed) { + return; + } + void this.crawlState.setFailReason(reason); + logger.fatal( + "Content check failed, failing crawl", + { reason }, + "behavior", + ExitCodes.Failed, + ); + }, + ); + } } async setupExecContextEvents( @@ -1299,7 +1321,7 @@ self.__bx_behaviors.selectMainBehavior(); "Seed Page Load Failed, failing crawl", {}, "general", - 1, + ExitCodes.GenericError, ); } } @@ -2236,8 +2258,13 @@ self.__bx_behaviors.selectMainBehavior(); await this.netIdle(page, logDetails); + // allow failing crawl via script only within awaitPageLoad() for now + data.contentCheckAllowed = true; + await this.awaitPageLoad(page.mainFrame(), logDetails); + data.contentCheckAllowed = false; + // skip extraction if at max depth if (seed.isAtMaxDepth(depth, extraHops)) { logger.debug("Skipping Link Extraction, At Max Depth", {}, "links"); @@ -2278,7 +2305,7 @@ self.__bx_behaviors.selectMainBehavior(); frame.evaluate( "self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();", ), - PAGE_OP_TIMEOUT_SECS, + PAGE_OP_TIMEOUT_SECS * 4, "Custom page load check timed out", logDetails, ); diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 21b2db88..95ee3e1a 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -587,6 +587,13 @@ class ArgParser { default: false, }, + failOnContentCheck: { + describe: + "If set, allows for behaviors to fail a crawl with custom reason based on content (e.g. logged out)", + type: "boolean", + default: false, + }, + customBehaviors: { describe: "Custom behavior files to inject. 
Valid values: URL to file, path to file, path to directory" + diff --git a/src/util/constants.ts b/src/util/constants.ts index d6185d4e..866678fc 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -30,6 +30,8 @@ export enum BxFunctionBindings { InitFlow = "__bx_initFlow", NextFlowStep = "__bx_nextFlowStep", + + ContentCheckFailed = "__bx_contentCheckFailed", } export const MAX_DEPTH = 1000000; diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index f9f17659..dba22a5e 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -1,4 +1,8 @@ -import { getCustomRewriter, getStatusText } from "@webrecorder/wabac"; +import { + getCustomRewriter, + getStatusText, + ExtraOpts, +} from "@webrecorder/wabac"; import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; @@ -66,9 +70,7 @@ export class RequestResponseInfo { resourceType?: string; - // TODO: Fix this the next time the file is edited. - // eslint-disable-next-line @typescript-eslint/no-explicit-any - extraOpts: Record = {}; + extraOpts: ExtraOpts = {}; // stats readSize: number = 0; diff --git a/src/util/state.ts b/src/util/state.ts index 107683af..317c96ff 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -87,6 +87,7 @@ export class PageState { pageSkipped = false; filteredFrames: Frame[] = []; loadState: LoadState = LoadState.FAILED; + contentCheckAllowed = false; logDetails = {}; @@ -447,6 +448,15 @@ return inx; return (await this.queueSize()) == 0 && (await this.numDone()) > 0; } + async isFailed() { + return ( + (await this.numDone()) === 0 && + (await this.queueSize()) === 0 && + (await this.numPending()) === 0 && + (await this.numFailed()) > 0 + ); + } + async trimToLimit(limit: number) { const totalComplete = (await this.numPending()) + @@ -465,6 +475,10 @@ return inx; } } + async setFailReason(reason: string) { + await this.redis.set(`${this.key}:failReason`, reason); + } + async setStatus(status_: string) { await this.redis.hset(`${this.key}:status`, 
this.uid, status_); } diff --git a/tsconfig.json b/tsconfig.json index e913ef61..61251e38 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -101,7 +101,7 @@ // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ /* Completeness */ - "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ + //"skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ "skipLibCheck": true /* Skip type checking all .d.ts files. */ }, diff --git a/yarn.lock b/yarn.lock index 4cb40b07..df313373 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1134,10 +1134,10 @@ resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406" integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ== -"@webrecorder/wabac@^2.23.3": - version "2.23.3" - resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.3.tgz#405f53649183c54fd116e334eae2666d6514a341" - integrity sha512-NlPNGNmilNf/NEqHbCNPcib4GNnZKQJKK3PIiI0BvEdem/TEjvcn5wEBbUntTYn+VwrhX36QY2HC7Iag+dVnvw== +"@webrecorder/wabac@^2.23.6": + version "2.23.6" + resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.6.tgz#cfcf9ef071732de0b71d49b0d3276711e88788a7" + integrity sha512-eyRew3ddm0PzzD81racFf1REwTQeoMHAQLreG5+B5OBWBdfFblzlbsUTp4KqiFKHMZ14WXjIxznFcYwCUpkA6w== dependencies: "@peculiar/asn1-ecc" "^2.3.4" "@peculiar/asn1-schema" "^2.3.3" @@ -1595,10 +1595,10 @@ browserslist@^4.24.0: node-releases "^2.0.18" update-browserslist-db "^1.1.1" -browsertrix-behaviors@^0.8.5: - version "0.8.5" - resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.8.5.tgz#f93dc6fed15cb2266664c85eec7f0796c1634fa5" - integrity sha512-v6wv6NLJEhj3NbrmGEfOWyXf2TuJgj95Em+KfCTPRJxakTtsvH/A7n2FSNvqMhwusqrjpIR4ch6cEkDp4hblvQ== +browsertrix-behaviors@^0.9.0: + version "0.9.0" + resolved 
"https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.9.0.tgz#3789a07fdf43ca76b4cb4794119d082189338c6a" + integrity sha512-rfpgW7r9ApwwH20IbpQrJaWupsfbVgxQRRuARs4m8nzIdF/WKTv38fTHDbYci8wJulcdu8D/eAlzyIBPwhrkkA== dependencies: query-selector-shadow-dom "^1.0.1"