mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
crash page on prompt dialog loop to continue (#929)
- if a page is stuck in a window.alert / window.prompt loop, showing more than 10 consecutive dialogs (unrelated to unloading), call Page.crash() to move on to the next page more quickly, as not much else can be done.
- add exception handling in dialog accept/dismiss to avoid crawler crash
- fixes #926
This commit is contained in:
parent
8e44b31b45
commit
59df6bbd3f
2 changed files with 39 additions and 16 deletions
|
|
@ -47,6 +47,7 @@ import {
|
||||||
ExitCodes,
|
ExitCodes,
|
||||||
InterruptReason,
|
InterruptReason,
|
||||||
BxFunctionBindings,
|
BxFunctionBindings,
|
||||||
|
MAX_JS_DIALOG_PER_PAGE,
|
||||||
} from "./util/constants.js";
|
} from "./util/constants.js";
|
||||||
|
|
||||||
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
|
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
|
||||||
|
|
@ -874,30 +875,49 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
await this.browser.addInitScript(page, initScript);
|
await this.browser.addInitScript(page, initScript);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let dialogCount = 0;
|
||||||
|
|
||||||
// Handle JS dialogs:
|
// Handle JS dialogs:
|
||||||
// - Ensure off-page navigation is canceled while behavior is running
|
// - Ensure off-page navigation is canceled while behavior is running
|
||||||
// - dismiss close all other dialogs if not blocking unload
|
// - dismiss close all other dialogs if not blocking unload
|
||||||
page.on("dialog", async (dialog) => {
|
page.on("dialog", async (dialog) => {
|
||||||
let accepted = true;
|
let accepted = true;
|
||||||
if (dialog.type() === "beforeunload") {
|
let msg = {};
|
||||||
if (opts.pageBlockUnload) {
|
try {
|
||||||
accepted = false;
|
if (dialog.type() === "beforeunload") {
|
||||||
await dialog.dismiss();
|
if (opts.pageBlockUnload) {
|
||||||
|
accepted = false;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
await dialog.accept();
|
// other JS dialog, just dismiss
|
||||||
|
accepted = false;
|
||||||
|
if (dialogCount >= MAX_JS_DIALOG_PER_PAGE) {
|
||||||
|
// dialog likely in a loop, need to crash page to avoid being stuck
|
||||||
|
logger.error(
|
||||||
|
"JS Dialog appears to be in a loop, crashing page to continue",
|
||||||
|
);
|
||||||
|
await cdp.send("Page.crash");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
dialogCount++;
|
||||||
}
|
}
|
||||||
} else {
|
msg = {
|
||||||
// other JS dialog, just dismiss
|
accepted,
|
||||||
await dialog.dismiss();
|
blockingUnload: opts.pageBlockUnload,
|
||||||
|
message: dialog.message(),
|
||||||
|
type: dialog.type(),
|
||||||
|
page: page.url(),
|
||||||
|
workerid,
|
||||||
|
};
|
||||||
|
if (accepted) {
|
||||||
|
await dialog.accept();
|
||||||
|
} else {
|
||||||
|
await dialog.dismiss();
|
||||||
|
}
|
||||||
|
logger.debug("JS Dialog", msg);
|
||||||
|
} catch (e) {
|
||||||
|
logger.warn("JS Dialog Error", { ...msg, ...formatErr(e) });
|
||||||
}
|
}
|
||||||
logger.debug("JS Dialog", {
|
|
||||||
accepted,
|
|
||||||
blockingUnload: opts.pageBlockUnload,
|
|
||||||
message: dialog.message(),
|
|
||||||
type: dialog.type(),
|
|
||||||
page: page.url(),
|
|
||||||
workerid,
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
|
|
||||||
// only add if running with autoclick behavior
|
// only add if running with autoclick behavior
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,9 @@ export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
|
||||||
|
|
||||||
export const ROBOTS_CACHE_LIMIT = 100;
|
export const ROBOTS_CACHE_LIMIT = 100;
|
||||||
|
|
||||||
|
// max JS dialogs (alert/prompt) to allow per page
|
||||||
|
export const MAX_JS_DIALOG_PER_PAGE = 10;
|
||||||
|
|
||||||
export type ExtractSelector = {
|
export type ExtractSelector = {
|
||||||
selector: string;
|
selector: string;
|
||||||
extract: string;
|
extract: string;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue