mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Support option to fail crawl on content check (#861)
- add --failOnContentCheck for quick fail if content check in behavior fails - expose __bx_contentCheckFailed to cause an immediate failure from behavior - only allow failing crawl due to content check from within awaitPageLoad() callback - set a 'failReason' key to track that crawl failed due to a particular content check reason - deps: update to browsertrix-behaviors 0.9.0, update to wabac.js (2.23.6) - fixes #860 --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
parent
6244515818
commit
549d655173
8 changed files with 70 additions and 18 deletions
|
@ -18,8 +18,8 @@
|
|||
"dependencies": {
|
||||
"@novnc/novnc": "1.4.0",
|
||||
"@puppeteer/replay": "^3.1.1",
|
||||
"@webrecorder/wabac": "^2.23.3",
|
||||
"browsertrix-behaviors": "^0.8.5",
|
||||
"@webrecorder/wabac": "^2.23.6",
|
||||
"browsertrix-behaviors": "^0.9.0",
|
||||
"client-zip": "^2.4.5",
|
||||
"css-selector-parser": "^3.0.5",
|
||||
"fetch-socks": "^1.3.0",
|
||||
|
|
|
@ -645,6 +645,11 @@ export class Crawler {
|
|||
}
|
||||
}
|
||||
}
|
||||
if (await this.crawlState.isFailed()) {
|
||||
logger.error("Crawl failed, no pages crawled successfully");
|
||||
status = "failed";
|
||||
exitCode = ExitCodes.Failed;
|
||||
}
|
||||
} catch (e) {
|
||||
logger.error("Crawl failed", e);
|
||||
exitCode = ExitCodes.Failed;
|
||||
|
@ -938,7 +943,24 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
return nextFlowStep(id, page, workerid);
|
||||
});
|
||||
|
||||
// await page.exposeFunction("__bx_hasSet", (data: string) => this.crawlState.hasUserSet(data));
|
||||
if (this.params.failOnContentCheck) {
|
||||
await page.exposeFunction(
|
||||
BxFunctionBindings.ContentCheckFailed,
|
||||
(reason: string) => {
|
||||
// if called outside of awaitPageLoad(), ignore
|
||||
if (!opts.data.contentCheckAllowed) {
|
||||
return;
|
||||
}
|
||||
void this.crawlState.setFailReason(reason);
|
||||
logger.fatal(
|
||||
"Content check failed, failing crawl",
|
||||
{ reason },
|
||||
"behavior",
|
||||
ExitCodes.Failed,
|
||||
);
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
async setupExecContextEvents(
|
||||
|
@ -1299,7 +1321,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
"Seed Page Load Failed, failing crawl",
|
||||
{},
|
||||
"general",
|
||||
1,
|
||||
ExitCodes.GenericError,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -2236,8 +2258,13 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
await this.netIdle(page, logDetails);
|
||||
|
||||
// allow failing crawl via script only within awaitPageLoad() for now
|
||||
data.contentCheckAllowed = true;
|
||||
|
||||
await this.awaitPageLoad(page.mainFrame(), logDetails);
|
||||
|
||||
data.contentCheckAllowed = false;
|
||||
|
||||
// skip extraction if at max depth
|
||||
if (seed.isAtMaxDepth(depth, extraHops)) {
|
||||
logger.debug("Skipping Link Extraction, At Max Depth", {}, "links");
|
||||
|
@ -2278,7 +2305,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
frame.evaluate(
|
||||
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
|
||||
),
|
||||
PAGE_OP_TIMEOUT_SECS,
|
||||
PAGE_OP_TIMEOUT_SECS * 4,
|
||||
"Custom page load check timed out",
|
||||
logDetails,
|
||||
);
|
||||
|
|
|
@ -587,6 +587,13 @@ class ArgParser {
|
|||
default: false,
|
||||
},
|
||||
|
||||
failOnContentCheck: {
|
||||
describe:
|
||||
"If set, allows for behaviors to fail a crawl with custom reason based on content (e.g. logged out)",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
customBehaviors: {
|
||||
describe:
|
||||
"Custom behavior files to inject. Valid values: URL to file, path to file, path to directory" +
|
||||
|
|
|
@ -30,6 +30,8 @@ export enum BxFunctionBindings {
|
|||
|
||||
InitFlow = "__bx_initFlow",
|
||||
NextFlowStep = "__bx_nextFlowStep",
|
||||
|
||||
ContentCheckFailed = "__bx_contentCheckFailed",
|
||||
}
|
||||
|
||||
export const MAX_DEPTH = 1000000;
|
||||
|
|
|
@ -1,4 +1,8 @@
|
|||
import { getCustomRewriter, getStatusText } from "@webrecorder/wabac";
|
||||
import {
|
||||
getCustomRewriter,
|
||||
getStatusText,
|
||||
ExtraOpts,
|
||||
} from "@webrecorder/wabac";
|
||||
|
||||
import { Protocol } from "puppeteer-core";
|
||||
import { postToGetUrl } from "warcio";
|
||||
|
@ -66,9 +70,7 @@ export class RequestResponseInfo {
|
|||
|
||||
resourceType?: string;
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
extraOpts: Record<string, any> = {};
|
||||
extraOpts: ExtraOpts = {};
|
||||
|
||||
// stats
|
||||
readSize: number = 0;
|
||||
|
|
|
@ -87,6 +87,7 @@ export class PageState {
|
|||
pageSkipped = false;
|
||||
filteredFrames: Frame[] = [];
|
||||
loadState: LoadState = LoadState.FAILED;
|
||||
contentCheckAllowed = false;
|
||||
|
||||
logDetails = {};
|
||||
|
||||
|
@ -447,6 +448,15 @@ return inx;
|
|||
return (await this.queueSize()) == 0 && (await this.numDone()) > 0;
|
||||
}
|
||||
|
||||
async isFailed() {
|
||||
return (
|
||||
(await this.numDone()) === 0 &&
|
||||
(await this.queueSize()) === 0 &&
|
||||
(await this.numPending()) === 0 &&
|
||||
(await this.numFailed()) > 0
|
||||
);
|
||||
}
|
||||
|
||||
async trimToLimit(limit: number) {
|
||||
const totalComplete =
|
||||
(await this.numPending()) +
|
||||
|
@ -465,6 +475,10 @@ return inx;
|
|||
}
|
||||
}
|
||||
|
||||
async setFailReason(reason: string) {
|
||||
await this.redis.set(`${this.key}:failReason`, reason);
|
||||
}
|
||||
|
||||
async setStatus(status_: string) {
|
||||
await this.redis.hset(`${this.key}:status`, this.uid, status_);
|
||||
}
|
||||
|
|
|
@ -101,7 +101,7 @@
|
|||
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
|
||||
|
||||
/* Completeness */
|
||||
"skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
|
||||
//"skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
|
||||
"skipLibCheck": true /* Skip type checking all .d.ts files. */
|
||||
},
|
||||
|
||||
|
|
16
yarn.lock
16
yarn.lock
|
@ -1134,10 +1134,10 @@
|
|||
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
|
||||
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
|
||||
|
||||
"@webrecorder/wabac@^2.23.3":
|
||||
version "2.23.3"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.3.tgz#405f53649183c54fd116e334eae2666d6514a341"
|
||||
integrity sha512-NlPNGNmilNf/NEqHbCNPcib4GNnZKQJKK3PIiI0BvEdem/TEjvcn5wEBbUntTYn+VwrhX36QY2HC7Iag+dVnvw==
|
||||
"@webrecorder/wabac@^2.23.6":
|
||||
version "2.23.6"
|
||||
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.6.tgz#cfcf9ef071732de0b71d49b0d3276711e88788a7"
|
||||
integrity sha512-eyRew3ddm0PzzD81racFf1REwTQeoMHAQLreG5+B5OBWBdfFblzlbsUTp4KqiFKHMZ14WXjIxznFcYwCUpkA6w==
|
||||
dependencies:
|
||||
"@peculiar/asn1-ecc" "^2.3.4"
|
||||
"@peculiar/asn1-schema" "^2.3.3"
|
||||
|
@ -1595,10 +1595,10 @@ browserslist@^4.24.0:
|
|||
node-releases "^2.0.18"
|
||||
update-browserslist-db "^1.1.1"
|
||||
|
||||
browsertrix-behaviors@^0.8.5:
|
||||
version "0.8.5"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.8.5.tgz#f93dc6fed15cb2266664c85eec7f0796c1634fa5"
|
||||
integrity sha512-v6wv6NLJEhj3NbrmGEfOWyXf2TuJgj95Em+KfCTPRJxakTtsvH/A7n2FSNvqMhwusqrjpIR4ch6cEkDp4hblvQ==
|
||||
browsertrix-behaviors@^0.9.0:
|
||||
version "0.9.0"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.9.0.tgz#3789a07fdf43ca76b4cb4794119d082189338c6a"
|
||||
integrity sha512-rfpgW7r9ApwwH20IbpQrJaWupsfbVgxQRRuARs4m8nzIdF/WKTv38fTHDbYci8wJulcdu8D/eAlzyIBPwhrkkA==
|
||||
dependencies:
|
||||
query-selector-shadow-dom "^1.0.1"
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue