Instead of puppeteer.exposeFunction(), use CDP function bindings directly to avoid issues with custom toJSON overrides:

- add Runtime.addBinding for each exposed function, handle in one place with Runtime.bindingCalled
- convert binding names to BxFunctionBindings enum
- update to browsertrix-behaviors 0.7.1 to avoid waiting for return value
- fixes #770
This commit is contained in:
Ilya Kreymer 2025-02-20 20:49:48 -08:00
parent c25c6771a8
commit cf3a4dcb44
5 changed files with 58 additions and 31 deletions

View file

@ -18,7 +18,7 @@
"dependencies": { "dependencies": {
"@novnc/novnc": "1.4.0", "@novnc/novnc": "1.4.0",
"@webrecorder/wabac": "^2.20.8", "@webrecorder/wabac": "^2.20.8",
"browsertrix-behaviors": "^0.7.0", "browsertrix-behaviors": "^0.7.1",
"client-zip": "^2.4.5", "client-zip": "^2.4.5",
"css-selector-parser": "^3.0.5", "css-selector-parser": "^3.0.5",
"fetch-socks": "^1.3.0", "fetch-socks": "^1.3.0",

View file

@ -40,9 +40,7 @@ import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
import { Browser } from "./util/browser.js"; import { Browser } from "./util/browser.js";
import { import {
ADD_LINK_FUNC, BxFunctionBindings,
BEHAVIOR_LOG_FUNC,
FETCH_FUNC,
DISPLAY, DISPLAY,
ExtractSelector, ExtractSelector,
PAGE_OP_TIMEOUT_SECS, PAGE_OP_TIMEOUT_SECS,
@ -768,17 +766,43 @@ export class Crawler {
await this.screencaster.screencastPage(page, cdp, workerid); await this.screencaster.screencastPage(page, cdp, workerid);
} }
await page.exposeFunction( cdp.on("Runtime.bindingCalled", (params) => {
ADD_LINK_FUNC, const { name, payload } = params;
(url: string) => callbacks.addLink && callbacks.addLink(url),
); switch (name as BxFunctionBindings) {
case BxFunctionBindings.AddLinkFunc:
callbacks.addLink && callbacks.addLink(payload);
break;
case BxFunctionBindings.BehaviorLogFunc:
{
const logdata: { data: string; type: string } = JSON.parse(payload);
this._behaviorLog(logdata, page.url(), workerid);
}
break;
case BxFunctionBindings.FetchFunc:
if (recorder) {
recorder.addExternalFetch(payload, cdp);
}
break;
case BxFunctionBindings.AddToSeenSet:
this.crawlState
.addToUserSet(payload)
.catch((e) => logger.warn("Adding to URL set error", e));
break;
}
});
await cdp.send("Runtime.addBinding", {
name: BxFunctionBindings.AddLinkFunc,
});
if (this.params.behaviorOpts) { if (this.params.behaviorOpts) {
await page.exposeFunction( await cdp.send("Runtime.addBinding", {
BEHAVIOR_LOG_FUNC, name: BxFunctionBindings.BehaviorLogFunc,
(logdata: { data: string; type: string }) => });
this._behaviorLog(logdata, page.url(), workerid),
);
await this.browser.addInitScript(page, behaviors); await this.browser.addInitScript(page, behaviors);
const initScript = ` const initScript = `
@ -791,9 +815,11 @@ self.__bx_behaviors.selectMainBehavior();
this.behaviorsChecked = true; this.behaviorsChecked = true;
} }
await page.exposeFunction(FETCH_FUNC, (url: string) => { if (recorder) {
return recorder ? recorder.addExternalFetch(url, cdp) : true; await cdp.send("Runtime.addBinding", {
name: BxFunctionBindings.FetchFunc,
}); });
}
await this.browser.addInitScript(page, initScript); await this.browser.addInitScript(page, initScript);
} }
@ -873,11 +899,9 @@ self.__bx_behaviors.selectMainBehavior();
} }
} }
await page.exposeFunction("__bx_addSet", (data: string) => await cdp.send("Runtime.addBinding", {
this.crawlState.addToUserSet(data), name: BxFunctionBindings.AddToSeenSet,
); });
// await page.exposeFunction("__bx_hasSet", (data: string) => this.crawlState.hasUserSet(data));
} }
async setupExecContextEvents( async setupExecContextEvents(
@ -2295,7 +2319,7 @@ self.__bx_behaviors.selectMainBehavior();
selector, selector,
extract, extract,
isAttribute, isAttribute,
addLinkFunc: ADD_LINK_FUNC, addLinkFunc: BxFunctionBindings.AddLinkFunc,
}) })
.catch((e) => .catch((e) =>
logger.warn("Link Extraction failed in frame", { logger.warn("Link Extraction failed in frame", {

View file

@ -10,7 +10,6 @@ import { hideBin } from "yargs/helpers";
import { createParser } from "css-selector-parser"; import { createParser } from "css-selector-parser";
import { import {
BEHAVIOR_LOG_FUNC,
WAIT_UNTIL_OPTS, WAIT_UNTIL_OPTS,
EXTRACT_TEXT_TYPES, EXTRACT_TEXT_TYPES,
SERVICE_WORKER_OPTS, SERVICE_WORKER_OPTS,
@ -18,6 +17,7 @@ import {
BEHAVIOR_TYPES, BEHAVIOR_TYPES,
ExtractSelector, ExtractSelector,
DEFAULT_MAX_RETRIES, DEFAULT_MAX_RETRIES,
BxFunctionBindings,
} from "./constants.js"; } from "./constants.js";
import { ScopedSeed } from "./seeds.js"; import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js"; import { interpolateFilename } from "./storage.js";
@ -721,7 +721,7 @@ class ArgParser {
); );
} }
}); });
behaviorOpts.log = BEHAVIOR_LOG_FUNC; behaviorOpts.log = BxFunctionBindings.BehaviorLogFunc;
behaviorOpts.startEarly = true; behaviorOpts.startEarly = true;
behaviorOpts.clickSelector = argv.clickSelector; behaviorOpts.clickSelector = argv.clickSelector;
argv.behaviorOpts = JSON.stringify(behaviorOpts); argv.behaviorOpts = JSON.stringify(behaviorOpts);

View file

@ -22,9 +22,12 @@ export const DETECT_SITEMAP = "<detect>";
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"]; export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
export const BEHAVIOR_LOG_FUNC = "__bx_log"; export enum BxFunctionBindings {
export const ADD_LINK_FUNC = "__bx_addLink"; BehaviorLogFunc = "__bx_log",
export const FETCH_FUNC = "__bx_fetch"; AddLinkFunc = "__bx_addLink",
FetchFunc = "__bx_fetch",
AddToSeenSet = "__bx_addSet",
}
export const MAX_DEPTH = 1000000; export const MAX_DEPTH = 1000000;
export const DEFAULT_MAX_RETRIES = 2; export const DEFAULT_MAX_RETRIES = 2;

View file

@ -1460,10 +1460,10 @@ browserslist@^4.24.0:
node-releases "^2.0.18" node-releases "^2.0.18"
update-browserslist-db "^1.1.1" update-browserslist-db "^1.1.1"
browsertrix-behaviors@^0.7.0: browsertrix-behaviors@^0.7.1:
version "0.7.0" version "0.7.1"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.7.0.tgz#a08b7d3e9cd449d0d76b14a438e28472124fd1a4" resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.7.1.tgz#dcb30c038e4060ef2393eb001ce9f10e3ce71c39"
integrity sha512-t0X74puXJsH8sVkkVZwEdo8L5E1PYtzX/RkVXM4fwwBIL804bOB8WIV+5Dfwov/odaukhB67KZhM00hN60SiBA== integrity sha512-tZ7Bv/IAWzLTNORf/yQqGHpPAQ4tP8sxql8YT491VHlCk939F1YIUrQ36XJOaSyfjmmm2WV9nCMXkDpCsw6zQg==
dependencies: dependencies:
query-selector-shadow-dom "^1.0.1" query-selector-shadow-dom "^1.0.1"