mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-07 13:49:47 +00:00
better failure detection, allow update support for captcha detection via behaviors (#917)
- allow fail on content check from main behavior
- update to behaviors 0.9.6 to support 'captcha_found' content check for TikTok
- allow throwing from timedRun
- call fatal() if profile cannot be extracted
This commit is contained in:
parent
87edef3362
commit
565ba54454
6 changed files with 23 additions and 8 deletions
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "1.9.1",
|
||||
"version": "1.9.2",
|
||||
"main": "browsertrix-crawler",
|
||||
"type": "module",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
|
|
@ -19,7 +19,7 @@
|
|||
"@novnc/novnc": "1.4.0",
|
||||
"@puppeteer/replay": "^3.1.3",
|
||||
"@webrecorder/wabac": "^2.24.5",
|
||||
"browsertrix-behaviors": "^0.9.5",
|
||||
"browsertrix-behaviors": "^0.9.6",
|
||||
"client-zip": "^2.4.5",
|
||||
"css-selector-parser": "^3.0.5",
|
||||
"fetch-socks": "^1.3.0",
|
||||
|
|
|
|||
|
|
@ -1209,6 +1209,9 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
if (data.skipBehaviors) {
|
||||
logger.warn("Skipping behaviors for slow page", logDetails, "behavior");
|
||||
} else {
|
||||
// allow failing crawl via script from within behaviors also
|
||||
data.contentCheckAllowed = true;
|
||||
|
||||
const res = await timedRun(
|
||||
this.runBehaviors(
|
||||
page,
|
||||
|
|
@ -1224,6 +1227,8 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
true,
|
||||
);
|
||||
|
||||
data.contentCheckAllowed = false;
|
||||
|
||||
await this.netIdle(page, logDetails);
|
||||
|
||||
if (res) {
|
||||
|
|
@ -2296,7 +2301,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
await this.netIdle(page, logDetails);
|
||||
|
||||
// allow failing crawl via script only within awaitPageLoad() for now
|
||||
// allow failing crawl via script only within awaitPageLoad()
|
||||
data.contentCheckAllowed = true;
|
||||
|
||||
await this.awaitPageLoad(page.mainFrame(), logDetails);
|
||||
|
|
|
|||
|
|
@ -236,7 +236,11 @@ export class Browser {
|
|||
this.removeSingletons();
|
||||
return true;
|
||||
} catch (e) {
|
||||
logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
|
||||
logger.fatal(
|
||||
`Profile filename ${profileFilename} not a valid tar.gz, can not load profile, exiting`,
|
||||
{},
|
||||
"browser",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -167,6 +167,7 @@ export class S3StorageSync {
|
|||
{},
|
||||
"storage",
|
||||
true,
|
||||
true,
|
||||
);
|
||||
break;
|
||||
} catch (e) {
|
||||
|
|
|
|||
|
|
@ -14,8 +14,10 @@ export function timedRun(
|
|||
logDetails = {},
|
||||
context: LogContext = "general",
|
||||
isWarn = false,
|
||||
throwOnTimeout = false,
|
||||
) {
|
||||
// return Promise return value or log error if timeout is reached first
|
||||
// or throw if throwOnTimeout is true!
|
||||
const timeout = seconds * 1000;
|
||||
|
||||
let tm: NodeJS.Timeout;
|
||||
|
|
@ -36,6 +38,9 @@ export function timedRun(
|
|||
{ seconds: seconds, ...logDetails },
|
||||
context,
|
||||
);
|
||||
if (throwOnTimeout) {
|
||||
throw err;
|
||||
}
|
||||
} else {
|
||||
//logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
|
||||
throw err;
|
||||
|
|
|
|||
|
|
@ -1606,10 +1606,10 @@ browserslist@^4.24.0:
|
|||
node-releases "^2.0.18"
|
||||
update-browserslist-db "^1.1.1"
|
||||
|
||||
browsertrix-behaviors@^0.9.5:
|
||||
version "0.9.5"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.9.5.tgz#abe089b8f188d452387bf7b2e3b93f58735ca331"
|
||||
integrity sha512-vWHY4wx9kO4JkP/Lbvq23vyFJRMpPZTGNN/SotJsRxefAs7LE29moMhAWHLTeKusJ6AghP/PhsxV447yFX8CrA==
|
||||
browsertrix-behaviors@^0.9.6:
|
||||
version "0.9.6"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.9.6.tgz#9ebf583c4512172e8a5a1d35aa4f884b25eaa339"
|
||||
integrity sha512-Ar4OBkJDRLkA4gpVv8BmcryzUdVc8tGeIkTY90eGkkeNM1wMennr2/XoEYWpX4zQlwkG7tLxQD0vQujtRNdvqA==
|
||||
dependencies:
|
||||
query-selector-shadow-dom "^1.0.1"
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue