mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Support loading custom behaviors from URLs and/or filepaths (#707)
Fixes #368. The `--customBehaviors` flag is now an array, making it repeatable. This should be backwards compatible with the CLI flag, but may require changes to YAML configs when custom behaviors are used. Custom behaviors can be loaded from URLs, local filepaths, and paths to local directories, including any combination thereof. New tests are added to ensure loading behaviors from URLs as well as a mixed combination of URL and filepath works as expected. --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
e5bab8e7c8
commit
2a9b152531
8 changed files with 142 additions and 40 deletions
|
@ -50,6 +50,8 @@ RUN ln -s /app/dist/main.js /usr/bin/crawl; \
|
|||
ln -s /app/dist/main.js /usr/bin/qa; \
|
||||
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
|
||||
|
||||
RUN mkdir -p /app/behaviors
|
||||
|
||||
WORKDIR /crawls
|
||||
|
||||
# enable to test custom behaviors build (from browsertrix-behaviors)
|
||||
|
|
|
@ -246,9 +246,11 @@ Options:
|
|||
ailOnFailedSeed may result in crawl
|
||||
failing due to non-200 responses
|
||||
[boolean] [default: false]
|
||||
--customBehaviors injects a custom behavior file or se
|
||||
t of behavior files in a directory
|
||||
[string]
|
||||
--customBehaviors Custom behavior files to inject. Val
|
||||
ues can be URLs, paths to individual
|
||||
behavior files, or paths to a direct
|
||||
ory of behavior files.
|
||||
[array] [default: []]
|
||||
--debugAccessRedis if set, runs internal redis without
|
||||
protected mode to allow external acc
|
||||
ess (for debugging) [boolean]
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "1.3.4",
|
||||
"version": "1.4.0-beta.0",
|
||||
"main": "browsertrix-crawler",
|
||||
"type": "module",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
|
|
|
@ -39,7 +39,7 @@ import {
|
|||
runWorkers,
|
||||
} from "./util/worker.js";
|
||||
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
|
||||
import { collectAllFileSources, getInfoString } from "./util/file_reader.js";
|
||||
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
|
||||
|
||||
import { Browser } from "./util/browser.js";
|
||||
|
||||
|
@ -511,7 +511,7 @@ export class Crawler {
|
|||
}
|
||||
|
||||
if (this.params.customBehaviors) {
|
||||
this.customBehaviors = this.loadCustomBehaviors(
|
||||
this.customBehaviors = await this.loadCustomBehaviors(
|
||||
this.params.customBehaviors,
|
||||
);
|
||||
}
|
||||
|
@ -801,10 +801,10 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
});
|
||||
}
|
||||
|
||||
loadCustomBehaviors(filename: string) {
|
||||
async loadCustomBehaviors(sources: string[]) {
|
||||
let str = "";
|
||||
|
||||
for (const { contents } of collectAllFileSources(filename, ".js")) {
|
||||
for (const { contents } of await collectCustomBehaviors(sources)) {
|
||||
str += `self.__bx_behaviors.load(${contents});\n`;
|
||||
}
|
||||
|
||||
|
@ -812,13 +812,13 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
|
||||
async checkBehaviorScripts(cdp: CDPSession) {
|
||||
const filename = this.params.customBehaviors;
|
||||
const sources = this.params.customBehaviors;
|
||||
|
||||
if (!filename) {
|
||||
if (!sources) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const { path, contents } of collectAllFileSources(filename, ".js")) {
|
||||
for (const { path, contents } of await collectCustomBehaviors(sources)) {
|
||||
await this.browser.checkScript(cdp, path, contents);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -551,8 +551,10 @@ class ArgParser {
|
|||
|
||||
customBehaviors: {
|
||||
describe:
|
||||
"injects a custom behavior file or set of behavior files in a directory",
|
||||
type: "string",
|
||||
"Custom behavior files to inject. Values can be URLs, paths to individual behavior files, or paths" +
|
||||
" to a directory of behavior files",
|
||||
type: "array",
|
||||
default: [],
|
||||
},
|
||||
|
||||
debugAccessRedis: {
|
||||
|
|
|
@ -1,27 +1,83 @@
|
|||
import fs from "fs";
|
||||
import fsp from "fs/promises";
|
||||
import path from "path";
|
||||
import crypto from "crypto";
|
||||
import { fetch } from "undici";
|
||||
|
||||
import { logger } from "./logger.js";
|
||||
|
||||
const MAX_DEPTH = 2;
|
||||
|
||||
export function collectAllFileSources(
|
||||
// Add .ts to allowed extensions when we can support it
|
||||
const ALLOWED_EXTS = [".js"];
|
||||
|
||||
export type FileSource = {
|
||||
path: string;
|
||||
contents: string;
|
||||
};
|
||||
|
||||
export type FileSources = FileSource[];
|
||||
|
||||
/**
 * Collect custom behavior scripts from a list of sources.
 *
 * Each entry is handled in order: entries with an explicit `http://` or
 * `https://` scheme are downloaded, everything else is treated as a local
 * file or directory path. Results are concatenated in source order.
 *
 * @param sources - URLs and/or local file or directory paths.
 * @returns All behavior file sources collected from every entry.
 */
export async function collectCustomBehaviors(
  sources: string[],
): Promise<FileSources> {
  const collectedSources: FileSources = [];

  for (const fileSource of sources) {
    // Require a full scheme so that a local path which merely begins with
    // "http" (e.g. "http-behaviors/") is not mistaken for a URL.
    const isUrl = /^https?:\/\//i.test(fileSource);

    const newSources = isUrl
      ? await collectOnlineBehavior(fileSource)
      : await collectLocalPathBehaviors(fileSource);

    collectedSources.push(...newSources);
  }

  return collectedSources;
}
|
||||
|
||||
/**
 * Download a single custom behavior script from a URL and load it from disk.
 *
 * The response body is written to a randomly named `.js` file under
 * `/app/behaviors/` and then read back through `collectLocalPathBehaviors`.
 * Any failure (network error, non-2xx HTTP status, write error) is logged
 * and an empty list is returned instead of throwing.
 *
 * @param url - URL of the behavior script to download.
 * @returns The file sources collected from the downloaded file, or `[]` on failure.
 */
async function collectOnlineBehavior(url: string): Promise<FileSources> {
  const filename = crypto.randomBytes(4).toString("hex") + ".js";
  const behaviorFilepath = `/app/behaviors/${filename}`;

  try {
    const res = await fetch(url);

    // fetch() only rejects on network-level failures; a 404/500 still
    // resolves. Without this check the error page body would be saved and
    // later injected as if it were a behavior script.
    if (!res.ok) {
      throw new Error(
        `Unexpected HTTP status ${res.status} ${res.statusText}`.trim(),
      );
    }

    const fileContents = await res.text();
    await fsp.writeFile(behaviorFilepath, fileContents);
    logger.info(
      "Custom behavior file downloaded",
      { url, path: behaviorFilepath },
      "behavior",
    );
    return await collectLocalPathBehaviors(behaviorFilepath);
  } catch (e) {
    logger.error(
      "Error downloading custom behavior from URL",
      { url, error: e },
      "behavior",
    );
  }

  return [];
}
|
||||
|
||||
async function collectLocalPathBehaviors(
|
||||
fileOrDir: string,
|
||||
ext?: string,
|
||||
depth = 0,
|
||||
): { path: string; contents: string }[] {
|
||||
): Promise<FileSources> {
|
||||
const resolvedPath = path.resolve(fileOrDir);
|
||||
|
||||
if (depth >= MAX_DEPTH) {
|
||||
console.warn(
|
||||
`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
|
||||
logger.warn(
|
||||
`Max depth of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
|
||||
{},
|
||||
"behavior",
|
||||
);
|
||||
return [];
|
||||
}
|
||||
|
||||
const stat = fs.statSync(resolvedPath);
|
||||
const stat = await fsp.stat(resolvedPath);
|
||||
|
||||
if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
|
||||
const contents = fs.readFileSync(resolvedPath);
|
||||
if (stat.isFile() && ALLOWED_EXTS.includes(path.extname(resolvedPath))) {
|
||||
const contents = await fsp.readFile(resolvedPath);
|
||||
return [
|
||||
{
|
||||
path: resolvedPath,
|
||||
|
@ -30,24 +86,28 @@ export function collectAllFileSources(
|
|||
];
|
||||
}
|
||||
|
||||
if (stat.isDirectory()) {
|
||||
const files = fs.readdirSync(resolvedPath);
|
||||
return files.reduce(
|
||||
(acc: { path: string; contents: string }[], next: string) => {
|
||||
const nextPath = path.join(fileOrDir, next);
|
||||
return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)];
|
||||
},
|
||||
[],
|
||||
const behaviors: FileSources = [];
|
||||
|
||||
const isDir = stat.isDirectory();
|
||||
|
||||
if (!isDir && depth === 0) {
|
||||
logger.warn(
|
||||
"The provided path is not a .js file or directory",
|
||||
{ path: resolvedPath },
|
||||
"behavior",
|
||||
);
|
||||
}
|
||||
|
||||
if (depth === 0) {
|
||||
console.warn(
|
||||
`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`,
|
||||
);
|
||||
if (isDir) {
|
||||
const files = await fsp.readdir(resolvedPath);
|
||||
for (const file of files) {
|
||||
const filePath = path.join(resolvedPath, file);
|
||||
const newBehaviors = await collectLocalPathBehaviors(filePath, depth + 1);
|
||||
behaviors.push(...newBehaviors);
|
||||
}
|
||||
}
|
||||
|
||||
return [];
|
||||
return behaviors;
|
||||
}
|
||||
|
||||
export async function getInfoString() {
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
import child_process from "child_process";
|
||||
|
||||
test("test custom behaviors", async () => {
|
||||
test("test custom behaviors from local filepath", async () => {
|
||||
const res = child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
|
||||
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
|
||||
);
|
||||
|
||||
const log = res.toString();
|
||||
|
||||
// custom behavior ran for example.com
|
||||
// custom behavior ran for specs.webrecorder.net
|
||||
expect(
|
||||
log.indexOf(
|
||||
'{"state":{},"msg":"test-stat","page":"https://example.com/","workerid":0}}',
|
||||
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
|
||||
) > 0,
|
||||
).toBe(true);
|
||||
|
||||
|
@ -35,6 +35,42 @@ test("test custom behaviors", async () => {
|
|||
).toBe(true);
|
||||
});
|
||||
|
||||
test("test custom behavior from URL", async () => {
|
||||
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --scopeType page");
|
||||
|
||||
const log = res.toString();
|
||||
|
||||
expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);
|
||||
|
||||
expect(
|
||||
log.indexOf(
|
||||
'{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
|
||||
) > 0,
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("test mixed custom behavior sources", async () => {
|
||||
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page");
|
||||
|
||||
const log = res.toString();
|
||||
|
||||
// test custom behavior from url ran
|
||||
expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);
|
||||
|
||||
expect(
|
||||
log.indexOf(
|
||||
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
|
||||
) > 0,
|
||||
).toBe(true);
|
||||
|
||||
// test custom behavior from local file ran
|
||||
expect(
|
||||
log.indexOf(
|
||||
'{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
|
||||
) > 0,
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("test invalid behavior exit", async () => {
|
||||
let status = 0;
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ class TestBehavior {
|
|||
}
|
||||
|
||||
static isMatch() {
|
||||
return window.location.origin === "https://example.com";
|
||||
return window.location.origin === "https://specs.webrecorder.net";
|
||||
}
|
||||
|
||||
async *run(ctx) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue