better failure detection, add support for captcha detection via updated behaviors (#917)

- allow failing the crawl on a content check from the main behavior
- update to browsertrix-behaviors 0.9.6 to support the 'captcha_found' content check for
TikTok
- allow throwing from timedRun
- call fatal() if the profile cannot be extracted
This commit is contained in:
Ilya Kreymer 2025-11-19 15:49:49 -08:00 committed by GitHub
parent 87edef3362
commit 565ba54454
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 23 additions and 8 deletions

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.9.1",
"version": "1.9.2",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
@ -19,7 +19,7 @@
"@novnc/novnc": "1.4.0",
"@puppeteer/replay": "^3.1.3",
"@webrecorder/wabac": "^2.24.5",
"browsertrix-behaviors": "^0.9.5",
"browsertrix-behaviors": "^0.9.6",
"client-zip": "^2.4.5",
"css-selector-parser": "^3.0.5",
"fetch-socks": "^1.3.0",

View file

@ -1209,6 +1209,9 @@ self.__bx_behaviors.selectMainBehavior();
if (data.skipBehaviors) {
logger.warn("Skipping behaviors for slow page", logDetails, "behavior");
} else {
// allow failing crawl via script from within behaviors also
data.contentCheckAllowed = true;
const res = await timedRun(
this.runBehaviors(
page,
@ -1224,6 +1227,8 @@ self.__bx_behaviors.selectMainBehavior();
true,
);
data.contentCheckAllowed = false;
await this.netIdle(page, logDetails);
if (res) {
@ -2296,7 +2301,7 @@ self.__bx_behaviors.selectMainBehavior();
await this.netIdle(page, logDetails);
// allow failing crawl via script only within awaitPageLoad() for now
// allow failing crawl via script only within awaitPageLoad()
data.contentCheckAllowed = true;
await this.awaitPageLoad(page.mainFrame(), logDetails);

View file

@ -236,7 +236,11 @@ export class Browser {
this.removeSingletons();
return true;
} catch (e) {
logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
logger.fatal(
`Profile filename ${profileFilename} not a valid tar.gz, can not load profile, exiting`,
{},
"browser",
);
}
}

View file

@ -167,6 +167,7 @@ export class S3StorageSync {
{},
"storage",
true,
true,
);
break;
} catch (e) {

View file

@ -14,8 +14,10 @@ export function timedRun(
logDetails = {},
context: LogContext = "general",
isWarn = false,
throwOnTimeout = false,
) {
// return Promise return value or log error if timeout is reached first
// or throw if throwOnTimeout is true!
const timeout = seconds * 1000;
let tm: NodeJS.Timeout;
@ -36,6 +38,9 @@ export function timedRun(
{ seconds: seconds, ...logDetails },
context,
);
if (throwOnTimeout) {
throw err;
}
} else {
//logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
throw err;

View file

@ -1606,10 +1606,10 @@ browserslist@^4.24.0:
node-releases "^2.0.18"
update-browserslist-db "^1.1.1"
browsertrix-behaviors@^0.9.5:
version "0.9.5"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.9.5.tgz#abe089b8f188d452387bf7b2e3b93f58735ca331"
integrity sha512-vWHY4wx9kO4JkP/Lbvq23vyFJRMpPZTGNN/SotJsRxefAs7LE29moMhAWHLTeKusJ6AghP/PhsxV447yFX8CrA==
browsertrix-behaviors@^0.9.6:
version "0.9.6"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.9.6.tgz#9ebf583c4512172e8a5a1d35aa4f884b25eaa339"
integrity sha512-Ar4OBkJDRLkA4gpVv8BmcryzUdVc8tGeIkTY90eGkkeNM1wMennr2/XoEYWpX4zQlwkG7tLxQD0vQujtRNdvqA==
dependencies:
query-selector-shadow-dom "^1.0.1"