Move extractLinks to behaviors + Update to browsertrix-behaviors 0.8.0 (#803)

- extractLinks() is now handled via browsertrix-behaviors
- fixes #770 via browsertrix-behaviors, checks for toJSON overrides
- organize the names of exposed functions into an enum (BxFunctionBindings)
This commit is contained in:
Ilya Kreymer 2025-03-31 12:02:25 -07:00 committed by GitHub
parent 02c4353b4a
commit e751929a7a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 52 additions and 71 deletions

File diff suppressed because one or more lines are too long

View file

@ -18,7 +18,7 @@
"dependencies": {
"@novnc/novnc": "1.4.0",
"@webrecorder/wabac": "^2.20.8",
"browsertrix-behaviors": "^0.7.0",
"browsertrix-behaviors": "0.8.0",
"client-zip": "^2.4.5",
"css-selector-parser": "^3.0.5",
"fetch-socks": "^1.3.0",

View file

@ -40,15 +40,13 @@ import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
import { Browser } from "./util/browser.js";
import {
ADD_LINK_FUNC,
BEHAVIOR_LOG_FUNC,
FETCH_FUNC,
DISPLAY,
ExtractSelector,
PAGE_OP_TIMEOUT_SECS,
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
ExitCodes,
InterruptReason,
BxFunctionBindings,
} from "./util/constants.js";
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
@ -74,7 +72,7 @@ import {
import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
import { initProxy } from "./util/proxy.js";
const behaviors = fs.readFileSync(
const btrixBehaviors = fs.readFileSync(
new URL(
"../node_modules/browsertrix-behaviors/dist/behaviors.js",
import.meta.url,
@ -769,17 +767,19 @@ export class Crawler {
}
await page.exposeFunction(
ADD_LINK_FUNC,
BxFunctionBindings.AddLinkFunc,
(url: string) => callbacks.addLink && callbacks.addLink(url),
);
// used for both behaviors and link extraction now
await this.browser.addInitScript(page, btrixBehaviors);
if (this.params.behaviorOpts) {
await page.exposeFunction(
BEHAVIOR_LOG_FUNC,
BxFunctionBindings.BehaviorLogFunc,
(logdata: { data: string; type: string }) =>
this._behaviorLog(logdata, page.url(), workerid),
);
await this.browser.addInitScript(page, behaviors);
const initScript = `
self.__bx_behaviors.init(${this.params.behaviorOpts}, false);
@ -791,8 +791,8 @@ self.__bx_behaviors.selectMainBehavior();
this.behaviorsChecked = true;
}
await page.exposeFunction(FETCH_FUNC, (url: string) => {
return recorder ? recorder.addExternalFetch(url, cdp) : true;
await page.exposeFunction(BxFunctionBindings.FetchFunc, (url: string) => {
return recorder ? recorder.addExternalFetch(url, cdp) : false;
});
await this.browser.addInitScript(page, initScript);
@ -873,7 +873,7 @@ self.__bx_behaviors.selectMainBehavior();
}
}
await page.exposeFunction("__bx_addSet", (data: string) =>
await page.exposeFunction(BxFunctionBindings.AddToSeenSet, (data: string) =>
this.crawlState.addToUserSet(data),
);
@ -2212,22 +2212,24 @@ self.__bx_behaviors.selectMainBehavior();
}
async awaitPageLoad(frame: Frame, logDetails: LogDetails) {
logger.debug(
"Waiting for custom page load via behavior",
logDetails,
"behavior",
);
try {
await timedRun(
frame.evaluate(
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
),
PAGE_OP_TIMEOUT_SECS,
"Custom page load check timed out",
if (this.params.behaviorOpts) {
logger.debug(
"Waiting for custom page load via behavior",
logDetails,
"behavior",
);
} catch (e) {
logger.warn("Waiting for custom page load failed", e, "behavior");
try {
await timedRun(
frame.evaluate(
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
),
PAGE_OP_TIMEOUT_SECS,
"Custom page load check timed out",
logDetails,
);
} catch (e) {
logger.warn("Waiting for custom page load failed", e, "behavior");
}
}
if (this.params.postLoadDelay) {
@ -2257,46 +2259,18 @@ self.__bx_behaviors.selectMainBehavior();
);
};
const loadLinks = (options: {
selector: string;
extract: string;
isAttribute: boolean;
addLinkFunc: string;
}) => {
const { selector, extract, isAttribute, addLinkFunc } = options;
const urls = new Set<string>();
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const getAttr = (elem: any) => urls.add(elem.getAttribute(extract));
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const getProp = (elem: any) => urls.add(elem[extract]);
const getter = isAttribute ? getAttr : getProp;
document.querySelectorAll(selector).forEach(getter);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const func = (window as any)[addLinkFunc] as (
url: string,
) => NonNullable<unknown>;
urls.forEach((url) => func.call(this, url));
return true;
};
const frames = filteredFrames || page.frames();
try {
for (const { selector, extract, isAttribute } of selectors) {
for (const { selector, extract, attrOnly } of selectors) {
await Promise.allSettled(
frames.map((frame) => {
const getLinks = frame
.evaluate(loadLinks, {
selector,
extract,
isAttribute,
addLinkFunc: ADD_LINK_FUNC,
})
.evaluate(
`self.__bx_behaviors.extractLinks(${JSON.stringify(
selector,
)}, ${JSON.stringify(extract)}, ${attrOnly})`,
)
.catch((e) =>
logger.warn("Link Extraction failed in frame", {
frameUrl: frame.url,

View file

@ -10,7 +10,6 @@ import { hideBin } from "yargs/helpers";
import { createParser } from "css-selector-parser";
import {
BEHAVIOR_LOG_FUNC,
WAIT_UNTIL_OPTS,
EXTRACT_TEXT_TYPES,
SERVICE_WORKER_OPTS,
@ -18,6 +17,7 @@ import {
BEHAVIOR_TYPES,
ExtractSelector,
DEFAULT_MAX_RETRIES,
BxFunctionBindings,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
@ -734,7 +734,7 @@ class ArgParser {
);
}
});
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
behaviorOpts.log = BxFunctionBindings.BehaviorLogFunc;
behaviorOpts.startEarly = true;
behaviorOpts.clickSelector = argv.clickSelector;
argv.behaviorOpts = JSON.stringify(behaviorOpts);

View file

@ -22,9 +22,12 @@ export const DETECT_SITEMAP = "<detect>";
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
export const BEHAVIOR_LOG_FUNC = "__bx_log";
export const ADD_LINK_FUNC = "__bx_addLink";
export const FETCH_FUNC = "__bx_fetch";
export enum BxFunctionBindings {
BehaviorLogFunc = "__bx_log",
AddLinkFunc = "__bx_addLink",
FetchFunc = "__bx_fetch",
AddToSeenSet = "__bx_addSet",
}
export const MAX_DEPTH = 1000000;
export const DEFAULT_MAX_RETRIES = 2;
@ -36,14 +39,14 @@ export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
export type ExtractSelector = {
selector: string;
extract: string;
isAttribute: boolean;
attrOnly: boolean;
};
export const DEFAULT_SELECTORS: ExtractSelector[] = [
{
selector: "a[href]",
extract: "href",
isAttribute: false,
attrOnly: false,
},
];

View file

@ -868,6 +868,11 @@ export class Recorder {
}
addExternalFetch(url: string, cdp: CDPSession) {
logger.debug(
"Handling fetch from behavior",
{ url, ...this.logDetails },
"recorder",
);
const reqresp = new RequestResponseInfo("0");
reqresp.url = url;
reqresp.method = "GET";

View file

@ -1460,10 +1460,10 @@ browserslist@^4.24.0:
node-releases "^2.0.18"
update-browserslist-db "^1.1.1"
browsertrix-behaviors@^0.7.0:
version "0.7.0"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.7.0.tgz#a08b7d3e9cd449d0d76b14a438e28472124fd1a4"
integrity sha512-t0X74puXJsH8sVkkVZwEdo8L5E1PYtzX/RkVXM4fwwBIL804bOB8WIV+5Dfwov/odaukhB67KZhM00hN60SiBA==
browsertrix-behaviors@0.8.0:
version "0.8.0"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.8.0.tgz#2e84a323f065bb4281ddfe6e3a96efbf18fb9b3c"
integrity sha512-aakiuTBf0SuX8P48/dbWZYPJ6TMGdqLEO9+Z3QfUpu4viBI7xY1necuJlTOuW4963AR/5aZCaLjfMNF/lOvl9w==
dependencies:
query-selector-shadow-dom "^1.0.1"