Support option to fail crawl on content check (#861)

- add --failOnContentCheck to fail the crawl quickly if a content check in a
behavior fails
- expose __bx_contentCheckFailed to cause an immediate failure from a
behavior
- only allow failing crawl due to content check from within
awaitPageLoad() callback
- set a 'failReason' key to track that crawl failed due to a particular
content check reason
- deps: update to browsertrix-behaviors 0.9.0, update to wabac.js
(2.23.6)
- fixes #860

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
Ilya Kreymer 2025-07-08 13:08:52 -07:00 committed by GitHub
parent 6244515818
commit 549d655173
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 70 additions and 18 deletions

View file

@ -18,8 +18,8 @@
"dependencies": {
"@novnc/novnc": "1.4.0",
"@puppeteer/replay": "^3.1.1",
"@webrecorder/wabac": "^2.23.3",
"browsertrix-behaviors": "^0.8.5",
"@webrecorder/wabac": "^2.23.6",
"browsertrix-behaviors": "^0.9.0",
"client-zip": "^2.4.5",
"css-selector-parser": "^3.0.5",
"fetch-socks": "^1.3.0",

View file

@ -645,6 +645,11 @@ export class Crawler {
}
}
}
if (await this.crawlState.isFailed()) {
logger.error("Crawl failed, no pages crawled successfully");
status = "failed";
exitCode = ExitCodes.Failed;
}
} catch (e) {
logger.error("Crawl failed", e);
exitCode = ExitCodes.Failed;
@ -938,7 +943,24 @@ self.__bx_behaviors.selectMainBehavior();
return nextFlowStep(id, page, workerid);
});
// await page.exposeFunction("__bx_hasSet", (data: string) => this.crawlState.hasUserSet(data));
if (this.params.failOnContentCheck) {
await page.exposeFunction(
BxFunctionBindings.ContentCheckFailed,
(reason: string) => {
// if called outside of awaitPageLoad(), ignore
if (!opts.data.contentCheckAllowed) {
return;
}
void this.crawlState.setFailReason(reason);
logger.fatal(
"Content check failed, failing crawl",
{ reason },
"behavior",
ExitCodes.Failed,
);
},
);
}
}
async setupExecContextEvents(
@ -1299,7 +1321,7 @@ self.__bx_behaviors.selectMainBehavior();
"Seed Page Load Failed, failing crawl",
{},
"general",
1,
ExitCodes.GenericError,
);
}
}
@ -2236,8 +2258,13 @@ self.__bx_behaviors.selectMainBehavior();
await this.netIdle(page, logDetails);
// allow failing crawl via script only within awaitPageLoad() for now
data.contentCheckAllowed = true;
await this.awaitPageLoad(page.mainFrame(), logDetails);
data.contentCheckAllowed = false;
// skip extraction if at max depth
if (seed.isAtMaxDepth(depth, extraHops)) {
logger.debug("Skipping Link Extraction, At Max Depth", {}, "links");
@ -2278,7 +2305,7 @@ self.__bx_behaviors.selectMainBehavior();
frame.evaluate(
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
),
PAGE_OP_TIMEOUT_SECS,
PAGE_OP_TIMEOUT_SECS * 4,
"Custom page load check timed out",
logDetails,
);

View file

@ -587,6 +587,13 @@ class ArgParser {
default: false,
},
failOnContentCheck: {
describe:
"If set, allows for behaviors to fail a crawl with custom reason based on content (e.g. logged out)",
type: "boolean",
default: false,
},
customBehaviors: {
describe:
"Custom behavior files to inject. Valid values: URL to file, path to file, path to directory" +

View file

@ -30,6 +30,8 @@ export enum BxFunctionBindings {
InitFlow = "__bx_initFlow",
NextFlowStep = "__bx_nextFlowStep",
ContentCheckFailed = "__bx_contentCheckFailed",
}
export const MAX_DEPTH = 1000000;

View file

@ -1,4 +1,8 @@
import { getCustomRewriter, getStatusText } from "@webrecorder/wabac";
import {
getCustomRewriter,
getStatusText,
ExtraOpts,
} from "@webrecorder/wabac";
import { Protocol } from "puppeteer-core";
import { postToGetUrl } from "warcio";
@ -66,9 +70,7 @@ export class RequestResponseInfo {
resourceType?: string;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
extraOpts: Record<string, any> = {};
extraOpts: ExtraOpts = {};
// stats
readSize: number = 0;

View file

@ -87,6 +87,7 @@ export class PageState {
pageSkipped = false;
filteredFrames: Frame[] = [];
loadState: LoadState = LoadState.FAILED;
contentCheckAllowed = false;
logDetails = {};
@ -447,6 +448,15 @@ return inx;
return (await this.queueSize()) == 0 && (await this.numDone()) > 0;
}
/**
 * Reports whether the crawl has ended with nothing but failures.
 *
 * Resolves to true only when no pages are done, the queue is empty,
 * nothing is pending, and at least one page has failed. The counters are
 * read one at a time, in that order, and checking stops at the first
 * condition that does not hold.
 */
async isFailed() {
  if ((await this.numDone()) !== 0) {
    return false;
  }

  if ((await this.queueSize()) !== 0) {
    return false;
  }

  if ((await this.numPending()) !== 0) {
    return false;
  }

  const failedCount = await this.numFailed();
  return failedCount > 0;
}
async trimToLimit(limit: number) {
const totalComplete =
(await this.numPending()) +
@ -465,6 +475,10 @@ return inx;
}
}
/**
 * Stores the reason the crawl failed under the `<key>:failReason` redis key.
 *
 * @param reason - text describing why the crawl failed
 */
async setFailReason(reason: string) {
  const failReasonKey = `${this.key}:failReason`;
  await this.redis.set(failReasonKey, reason);
}
/**
 * Writes a status value into the `<key>:status` redis hash, using this
 * instance's `uid` as the hash field.
 *
 * @param status_ - status value to store
 */
async setStatus(status_: string) {
  const statusHashKey = `${this.key}:status`;
  await this.redis.hset(statusHashKey, this.uid, status_);
}

View file

@ -101,7 +101,7 @@
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
/* Completeness */
"skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
//"skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
"skipLibCheck": true /* Skip type checking all .d.ts files. */
},

View file

@ -1134,10 +1134,10 @@
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
"@webrecorder/wabac@^2.23.3":
version "2.23.3"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.3.tgz#405f53649183c54fd116e334eae2666d6514a341"
integrity sha512-NlPNGNmilNf/NEqHbCNPcib4GNnZKQJKK3PIiI0BvEdem/TEjvcn5wEBbUntTYn+VwrhX36QY2HC7Iag+dVnvw==
"@webrecorder/wabac@^2.23.6":
version "2.23.6"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.6.tgz#cfcf9ef071732de0b71d49b0d3276711e88788a7"
integrity sha512-eyRew3ddm0PzzD81racFf1REwTQeoMHAQLreG5+B5OBWBdfFblzlbsUTp4KqiFKHMZ14WXjIxznFcYwCUpkA6w==
dependencies:
"@peculiar/asn1-ecc" "^2.3.4"
"@peculiar/asn1-schema" "^2.3.3"
@ -1595,10 +1595,10 @@ browserslist@^4.24.0:
node-releases "^2.0.18"
update-browserslist-db "^1.1.1"
browsertrix-behaviors@^0.8.5:
version "0.8.5"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.8.5.tgz#f93dc6fed15cb2266664c85eec7f0796c1634fa5"
integrity sha512-v6wv6NLJEhj3NbrmGEfOWyXf2TuJgj95Em+KfCTPRJxakTtsvH/A7n2FSNvqMhwusqrjpIR4ch6cEkDp4hblvQ==
browsertrix-behaviors@^0.9.0:
version "0.9.0"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.9.0.tgz#3789a07fdf43ca76b4cb4794119d082189338c6a"
integrity sha512-rfpgW7r9ApwwH20IbpQrJaWupsfbVgxQRRuARs4m8nzIdF/WKTv38fTHDbYci8wJulcdu8D/eAlzyIBPwhrkkA==
dependencies:
query-selector-shadow-dom "^1.0.1"