mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-07 13:49:47 +00:00
crash page on prompt dialog loop to continue crawling (#929)
- if a page is stuck in a window.alert / window.prompt loop, showing more than 10 consecutive dialogs (unrelated to unloading), call Page.crash() to move on to the next page more quickly, as not much else can be done.
- add exception handling in dialog accept/dismiss to avoid a crawler crash
- fixes #926
This commit is contained in:
parent
8e44b31b45
commit
59df6bbd3f
2 changed files with 39 additions and 16 deletions
|
|
@ -47,6 +47,7 @@ import {
|
|||
ExitCodes,
|
||||
InterruptReason,
|
||||
BxFunctionBindings,
|
||||
MAX_JS_DIALOG_PER_PAGE,
|
||||
} from "./util/constants.js";
|
||||
|
||||
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
|
||||
|
|
@ -874,30 +875,49 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
await this.browser.addInitScript(page, initScript);
|
||||
}
|
||||
|
||||
let dialogCount = 0;
|
||||
|
||||
// Handle JS dialogs:
|
||||
// - Ensure off-page navigation is canceled while behavior is running
|
||||
// - dismiss/close all other dialogs if not blocking unload
|
||||
page.on("dialog", async (dialog) => {
|
||||
let accepted = true;
|
||||
if (dialog.type() === "beforeunload") {
|
||||
if (opts.pageBlockUnload) {
|
||||
accepted = false;
|
||||
await dialog.dismiss();
|
||||
let msg = {};
|
||||
try {
|
||||
if (dialog.type() === "beforeunload") {
|
||||
if (opts.pageBlockUnload) {
|
||||
accepted = false;
|
||||
}
|
||||
} else {
|
||||
await dialog.accept();
|
||||
// other JS dialog, just dismiss
|
||||
accepted = false;
|
||||
if (dialogCount >= MAX_JS_DIALOG_PER_PAGE) {
|
||||
// dialog likely in a loop, need to crash page to avoid being stuck
|
||||
logger.error(
|
||||
"JS Dialog appears to be in a loop, crashing page to continue",
|
||||
);
|
||||
await cdp.send("Page.crash");
|
||||
return;
|
||||
}
|
||||
dialogCount++;
|
||||
}
|
||||
} else {
|
||||
// other JS dialog, just dismiss
|
||||
await dialog.dismiss();
|
||||
msg = {
|
||||
accepted,
|
||||
blockingUnload: opts.pageBlockUnload,
|
||||
message: dialog.message(),
|
||||
type: dialog.type(),
|
||||
page: page.url(),
|
||||
workerid,
|
||||
};
|
||||
if (accepted) {
|
||||
await dialog.accept();
|
||||
} else {
|
||||
await dialog.dismiss();
|
||||
}
|
||||
logger.debug("JS Dialog", msg);
|
||||
} catch (e) {
|
||||
logger.warn("JS Dialog Error", { ...msg, ...formatErr(e) });
|
||||
}
|
||||
logger.debug("JS Dialog", {
|
||||
accepted,
|
||||
blockingUnload: opts.pageBlockUnload,
|
||||
message: dialog.message(),
|
||||
type: dialog.type(),
|
||||
page: page.url(),
|
||||
workerid,
|
||||
});
|
||||
});
|
||||
|
||||
// only add if running with autoclick behavior
|
||||
|
|
|
|||
|
|
@ -43,6 +43,9 @@ export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
|
|||
|
||||
export const ROBOTS_CACHE_LIMIT = 100;
|
||||
|
||||
// max JS dialogs (alert/prompt) to allow per page
|
||||
export const MAX_JS_DIALOG_PER_PAGE = 10;
|
||||
|
||||
export type ExtractSelector = {
|
||||
selector: string;
|
||||
extract: string;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue