mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Move extractLinks to behaviors + Update to browsertrix-behaviors 0.8.0 (#803)
- extractLinks() now handled via browsertrix-behaviors - fixes #770 via browsertrix-behaviors, checks for toJSON overrides - organize exposed functions to enum list
This commit is contained in:
parent
02c4353b4a
commit
e751929a7a
7 changed files with 52 additions and 71 deletions
File diff suppressed because one or more lines are too long
|
@ -18,7 +18,7 @@
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@novnc/novnc": "1.4.0",
|
"@novnc/novnc": "1.4.0",
|
||||||
"@webrecorder/wabac": "^2.20.8",
|
"@webrecorder/wabac": "^2.20.8",
|
||||||
"browsertrix-behaviors": "^0.7.0",
|
"browsertrix-behaviors": "0.8.0",
|
||||||
"client-zip": "^2.4.5",
|
"client-zip": "^2.4.5",
|
||||||
"css-selector-parser": "^3.0.5",
|
"css-selector-parser": "^3.0.5",
|
||||||
"fetch-socks": "^1.3.0",
|
"fetch-socks": "^1.3.0",
|
||||||
|
|
|
@ -40,15 +40,13 @@ import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
|
||||||
import { Browser } from "./util/browser.js";
|
import { Browser } from "./util/browser.js";
|
||||||
|
|
||||||
import {
|
import {
|
||||||
ADD_LINK_FUNC,
|
|
||||||
BEHAVIOR_LOG_FUNC,
|
|
||||||
FETCH_FUNC,
|
|
||||||
DISPLAY,
|
DISPLAY,
|
||||||
ExtractSelector,
|
ExtractSelector,
|
||||||
PAGE_OP_TIMEOUT_SECS,
|
PAGE_OP_TIMEOUT_SECS,
|
||||||
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
|
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
|
||||||
ExitCodes,
|
ExitCodes,
|
||||||
InterruptReason,
|
InterruptReason,
|
||||||
|
BxFunctionBindings,
|
||||||
} from "./util/constants.js";
|
} from "./util/constants.js";
|
||||||
|
|
||||||
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
|
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
|
||||||
|
@ -74,7 +72,7 @@ import {
|
||||||
import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
|
import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
|
||||||
import { initProxy } from "./util/proxy.js";
|
import { initProxy } from "./util/proxy.js";
|
||||||
|
|
||||||
const behaviors = fs.readFileSync(
|
const btrixBehaviors = fs.readFileSync(
|
||||||
new URL(
|
new URL(
|
||||||
"../node_modules/browsertrix-behaviors/dist/behaviors.js",
|
"../node_modules/browsertrix-behaviors/dist/behaviors.js",
|
||||||
import.meta.url,
|
import.meta.url,
|
||||||
|
@ -769,17 +767,19 @@ export class Crawler {
|
||||||
}
|
}
|
||||||
|
|
||||||
await page.exposeFunction(
|
await page.exposeFunction(
|
||||||
ADD_LINK_FUNC,
|
BxFunctionBindings.AddLinkFunc,
|
||||||
(url: string) => callbacks.addLink && callbacks.addLink(url),
|
(url: string) => callbacks.addLink && callbacks.addLink(url),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// used for both behaviors and link extraction now
|
||||||
|
await this.browser.addInitScript(page, btrixBehaviors);
|
||||||
|
|
||||||
if (this.params.behaviorOpts) {
|
if (this.params.behaviorOpts) {
|
||||||
await page.exposeFunction(
|
await page.exposeFunction(
|
||||||
BEHAVIOR_LOG_FUNC,
|
BxFunctionBindings.BehaviorLogFunc,
|
||||||
(logdata: { data: string; type: string }) =>
|
(logdata: { data: string; type: string }) =>
|
||||||
this._behaviorLog(logdata, page.url(), workerid),
|
this._behaviorLog(logdata, page.url(), workerid),
|
||||||
);
|
);
|
||||||
await this.browser.addInitScript(page, behaviors);
|
|
||||||
|
|
||||||
const initScript = `
|
const initScript = `
|
||||||
self.__bx_behaviors.init(${this.params.behaviorOpts}, false);
|
self.__bx_behaviors.init(${this.params.behaviorOpts}, false);
|
||||||
|
@ -791,8 +791,8 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
this.behaviorsChecked = true;
|
this.behaviorsChecked = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
await page.exposeFunction(FETCH_FUNC, (url: string) => {
|
await page.exposeFunction(BxFunctionBindings.FetchFunc, (url: string) => {
|
||||||
return recorder ? recorder.addExternalFetch(url, cdp) : true;
|
return recorder ? recorder.addExternalFetch(url, cdp) : false;
|
||||||
});
|
});
|
||||||
|
|
||||||
await this.browser.addInitScript(page, initScript);
|
await this.browser.addInitScript(page, initScript);
|
||||||
|
@ -873,7 +873,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
await page.exposeFunction("__bx_addSet", (data: string) =>
|
await page.exposeFunction(BxFunctionBindings.AddToSeenSet, (data: string) =>
|
||||||
this.crawlState.addToUserSet(data),
|
this.crawlState.addToUserSet(data),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -2212,22 +2212,24 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
}
|
}
|
||||||
|
|
||||||
async awaitPageLoad(frame: Frame, logDetails: LogDetails) {
|
async awaitPageLoad(frame: Frame, logDetails: LogDetails) {
|
||||||
logger.debug(
|
if (this.params.behaviorOpts) {
|
||||||
"Waiting for custom page load via behavior",
|
logger.debug(
|
||||||
logDetails,
|
"Waiting for custom page load via behavior",
|
||||||
"behavior",
|
|
||||||
);
|
|
||||||
try {
|
|
||||||
await timedRun(
|
|
||||||
frame.evaluate(
|
|
||||||
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
|
|
||||||
),
|
|
||||||
PAGE_OP_TIMEOUT_SECS,
|
|
||||||
"Custom page load check timed out",
|
|
||||||
logDetails,
|
logDetails,
|
||||||
|
"behavior",
|
||||||
);
|
);
|
||||||
} catch (e) {
|
try {
|
||||||
logger.warn("Waiting for custom page load failed", e, "behavior");
|
await timedRun(
|
||||||
|
frame.evaluate(
|
||||||
|
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
|
||||||
|
),
|
||||||
|
PAGE_OP_TIMEOUT_SECS,
|
||||||
|
"Custom page load check timed out",
|
||||||
|
logDetails,
|
||||||
|
);
|
||||||
|
} catch (e) {
|
||||||
|
logger.warn("Waiting for custom page load failed", e, "behavior");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.params.postLoadDelay) {
|
if (this.params.postLoadDelay) {
|
||||||
|
@ -2257,46 +2259,18 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
const loadLinks = (options: {
|
|
||||||
selector: string;
|
|
||||||
extract: string;
|
|
||||||
isAttribute: boolean;
|
|
||||||
addLinkFunc: string;
|
|
||||||
}) => {
|
|
||||||
const { selector, extract, isAttribute, addLinkFunc } = options;
|
|
||||||
const urls = new Set<string>();
|
|
||||||
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
const getAttr = (elem: any) => urls.add(elem.getAttribute(extract));
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
const getProp = (elem: any) => urls.add(elem[extract]);
|
|
||||||
|
|
||||||
const getter = isAttribute ? getAttr : getProp;
|
|
||||||
|
|
||||||
document.querySelectorAll(selector).forEach(getter);
|
|
||||||
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
const func = (window as any)[addLinkFunc] as (
|
|
||||||
url: string,
|
|
||||||
) => NonNullable<unknown>;
|
|
||||||
urls.forEach((url) => func.call(this, url));
|
|
||||||
|
|
||||||
return true;
|
|
||||||
};
|
|
||||||
|
|
||||||
const frames = filteredFrames || page.frames();
|
const frames = filteredFrames || page.frames();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for (const { selector, extract, isAttribute } of selectors) {
|
for (const { selector, extract, attrOnly } of selectors) {
|
||||||
await Promise.allSettled(
|
await Promise.allSettled(
|
||||||
frames.map((frame) => {
|
frames.map((frame) => {
|
||||||
const getLinks = frame
|
const getLinks = frame
|
||||||
.evaluate(loadLinks, {
|
.evaluate(
|
||||||
selector,
|
`self.__bx_behaviors.extractLinks(${JSON.stringify(
|
||||||
extract,
|
selector,
|
||||||
isAttribute,
|
)}, ${JSON.stringify(extract)}, ${attrOnly})`,
|
||||||
addLinkFunc: ADD_LINK_FUNC,
|
)
|
||||||
})
|
|
||||||
.catch((e) =>
|
.catch((e) =>
|
||||||
logger.warn("Link Extraction failed in frame", {
|
logger.warn("Link Extraction failed in frame", {
|
||||||
frameUrl: frame.url,
|
frameUrl: frame.url,
|
||||||
|
|
|
@ -10,7 +10,6 @@ import { hideBin } from "yargs/helpers";
|
||||||
import { createParser } from "css-selector-parser";
|
import { createParser } from "css-selector-parser";
|
||||||
|
|
||||||
import {
|
import {
|
||||||
BEHAVIOR_LOG_FUNC,
|
|
||||||
WAIT_UNTIL_OPTS,
|
WAIT_UNTIL_OPTS,
|
||||||
EXTRACT_TEXT_TYPES,
|
EXTRACT_TEXT_TYPES,
|
||||||
SERVICE_WORKER_OPTS,
|
SERVICE_WORKER_OPTS,
|
||||||
|
@ -18,6 +17,7 @@ import {
|
||||||
BEHAVIOR_TYPES,
|
BEHAVIOR_TYPES,
|
||||||
ExtractSelector,
|
ExtractSelector,
|
||||||
DEFAULT_MAX_RETRIES,
|
DEFAULT_MAX_RETRIES,
|
||||||
|
BxFunctionBindings,
|
||||||
} from "./constants.js";
|
} from "./constants.js";
|
||||||
import { ScopedSeed } from "./seeds.js";
|
import { ScopedSeed } from "./seeds.js";
|
||||||
import { interpolateFilename } from "./storage.js";
|
import { interpolateFilename } from "./storage.js";
|
||||||
|
@ -734,7 +734,7 @@ class ArgParser {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
|
behaviorOpts.log = BxFunctionBindings.BehaviorLogFunc;
|
||||||
behaviorOpts.startEarly = true;
|
behaviorOpts.startEarly = true;
|
||||||
behaviorOpts.clickSelector = argv.clickSelector;
|
behaviorOpts.clickSelector = argv.clickSelector;
|
||||||
argv.behaviorOpts = JSON.stringify(behaviorOpts);
|
argv.behaviorOpts = JSON.stringify(behaviorOpts);
|
||||||
|
|
|
@ -22,9 +22,12 @@ export const DETECT_SITEMAP = "<detect>";
|
||||||
|
|
||||||
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
|
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
|
||||||
|
|
||||||
export const BEHAVIOR_LOG_FUNC = "__bx_log";
|
export enum BxFunctionBindings {
|
||||||
export const ADD_LINK_FUNC = "__bx_addLink";
|
BehaviorLogFunc = "__bx_log",
|
||||||
export const FETCH_FUNC = "__bx_fetch";
|
AddLinkFunc = "__bx_addLink",
|
||||||
|
FetchFunc = "__bx_fetch",
|
||||||
|
AddToSeenSet = "__bx_addSet",
|
||||||
|
}
|
||||||
|
|
||||||
export const MAX_DEPTH = 1000000;
|
export const MAX_DEPTH = 1000000;
|
||||||
export const DEFAULT_MAX_RETRIES = 2;
|
export const DEFAULT_MAX_RETRIES = 2;
|
||||||
|
@ -36,14 +39,14 @@ export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
|
||||||
export type ExtractSelector = {
|
export type ExtractSelector = {
|
||||||
selector: string;
|
selector: string;
|
||||||
extract: string;
|
extract: string;
|
||||||
isAttribute: boolean;
|
attrOnly: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const DEFAULT_SELECTORS: ExtractSelector[] = [
|
export const DEFAULT_SELECTORS: ExtractSelector[] = [
|
||||||
{
|
{
|
||||||
selector: "a[href]",
|
selector: "a[href]",
|
||||||
extract: "href",
|
extract: "href",
|
||||||
isAttribute: false,
|
attrOnly: false,
|
||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|
|
@ -868,6 +868,11 @@ export class Recorder {
|
||||||
}
|
}
|
||||||
|
|
||||||
addExternalFetch(url: string, cdp: CDPSession) {
|
addExternalFetch(url: string, cdp: CDPSession) {
|
||||||
|
logger.debug(
|
||||||
|
"Handling fetch from behavior",
|
||||||
|
{ url, ...this.logDetails },
|
||||||
|
"recorder",
|
||||||
|
);
|
||||||
const reqresp = new RequestResponseInfo("0");
|
const reqresp = new RequestResponseInfo("0");
|
||||||
reqresp.url = url;
|
reqresp.url = url;
|
||||||
reqresp.method = "GET";
|
reqresp.method = "GET";
|
||||||
|
|
|
@ -1460,10 +1460,10 @@ browserslist@^4.24.0:
|
||||||
node-releases "^2.0.18"
|
node-releases "^2.0.18"
|
||||||
update-browserslist-db "^1.1.1"
|
update-browserslist-db "^1.1.1"
|
||||||
|
|
||||||
browsertrix-behaviors@^0.7.0:
|
browsertrix-behaviors@0.8.0:
|
||||||
version "0.7.0"
|
version "0.8.0"
|
||||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.7.0.tgz#a08b7d3e9cd449d0d76b14a438e28472124fd1a4"
|
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.8.0.tgz#2e84a323f065bb4281ddfe6e3a96efbf18fb9b3c"
|
||||||
integrity sha512-t0X74puXJsH8sVkkVZwEdo8L5E1PYtzX/RkVXM4fwwBIL804bOB8WIV+5Dfwov/odaukhB67KZhM00hN60SiBA==
|
integrity sha512-aakiuTBf0SuX8P48/dbWZYPJ6TMGdqLEO9+Z3QfUpu4viBI7xY1necuJlTOuW4963AR/5aZCaLjfMNF/lOvl9w==
|
||||||
dependencies:
|
dependencies:
|
||||||
query-selector-shadow-dom "^1.0.1"
|
query-selector-shadow-dom "^1.0.1"
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue