Support loading custom behaviors from URLs and/or filepaths (#707)

Fixes #368 

The `--customBehaviors` flag is now an array, making it repeatable. This
should be backwards compatible with the CLI flag, but may require
changes to YAML configs when custom behaviors are used.

Custom behaviors can be loaded from URLs, local filepaths, and paths to
local directories, including any combination thereof.

New tests are added to ensure loading behaviors from URLs as well as a
mixed combination of URL and filepath works as expected.

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
Tessa Walsh 2024-11-04 23:30:53 -05:00 committed by GitHub
parent e5bab8e7c8
commit 2a9b152531
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 142 additions and 40 deletions

View file

@ -50,6 +50,8 @@ RUN ln -s /app/dist/main.js /usr/bin/crawl; \
ln -s /app/dist/main.js /usr/bin/qa; \
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
RUN mkdir -p /app/behaviors
WORKDIR /crawls
# enable to test custom behaviors build (from browsertrix-behaviors)

View file

@ -246,9 +246,11 @@ Options:
ailOnFailedSeed may result in crawl
failing due to non-200 responses
[boolean] [default: false]
--customBehaviors injects a custom behavior file or se
t of behavior files in a directory
[string]
--customBehaviors Custom behavior files to inject. Val
ues can be URLs, paths to individual
behavior files, or paths to a direct
ory of behavior files.
[array] [default: []]
--debugAccessRedis if set, runs internal redis without
protected mode to allow external acc
ess (for debugging) [boolean]

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.3.4",
"version": "1.4.0-beta.0",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",

View file

@ -39,7 +39,7 @@ import {
runWorkers,
} from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectAllFileSources, getInfoString } from "./util/file_reader.js";
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
import { Browser } from "./util/browser.js";
@ -511,7 +511,7 @@ export class Crawler {
}
if (this.params.customBehaviors) {
this.customBehaviors = this.loadCustomBehaviors(
this.customBehaviors = await this.loadCustomBehaviors(
this.params.customBehaviors,
);
}
@ -801,10 +801,10 @@ self.__bx_behaviors.selectMainBehavior();
});
}
loadCustomBehaviors(filename: string) {
async loadCustomBehaviors(sources: string[]) {
let str = "";
for (const { contents } of collectAllFileSources(filename, ".js")) {
for (const { contents } of await collectCustomBehaviors(sources)) {
str += `self.__bx_behaviors.load(${contents});\n`;
}
@ -812,13 +812,13 @@ self.__bx_behaviors.selectMainBehavior();
}
async checkBehaviorScripts(cdp: CDPSession) {
const filename = this.params.customBehaviors;
const sources = this.params.customBehaviors;
if (!filename) {
if (!sources) {
return;
}
for (const { path, contents } of collectAllFileSources(filename, ".js")) {
for (const { path, contents } of await collectCustomBehaviors(sources)) {
await this.browser.checkScript(cdp, path, contents);
}
}

View file

@ -551,8 +551,10 @@ class ArgParser {
customBehaviors: {
describe:
"injects a custom behavior file or set of behavior files in a directory",
type: "string",
"Custom behavior files to inject. Values can be URLs, paths to individual behavior files, or paths" +
" to a directory of behavior files",
type: "array",
default: [],
},
debugAccessRedis: {

View file

@ -1,27 +1,83 @@
import fs from "fs";
import fsp from "fs/promises";
import path from "path";
import crypto from "crypto";
import { fetch } from "undici";
import { logger } from "./logger.js";
const MAX_DEPTH = 2;
export function collectAllFileSources(
// Add .ts to allowed extensions when we can support it
const ALLOWED_EXTS = [".js"];
export type FileSource = {
path: string;
contents: string;
};
export type FileSources = FileSource[];
/**
 * Collects custom behavior scripts from a mixed list of sources.
 *
 * Each entry is either an http(s) URL, which is handed to
 * collectOnlineBehavior, or a local path, which is handed to
 * collectLocalPathBehaviors. Sources are processed one at a time, so the
 * returned list follows the order in which the sources were given.
 *
 * @param sources - URLs and/or local paths of custom behaviors.
 * @returns All behavior sources collected from the given entries.
 */
export async function collectCustomBehaviors(
  sources: string[],
): Promise<FileSources> {
  // Require a full http:// or https:// scheme (any letter case). A bare
  // "http" prefix check would also treat local paths such as
  // "http-behaviors/" as URLs.
  const httpUrlPattern = /^https?:\/\//i;
  const collectedSources: FileSources = [];

  for (const fileSource of sources) {
    const newSources = httpUrlPattern.test(fileSource)
      ? await collectOnlineBehavior(fileSource)
      : await collectLocalPathBehaviors(fileSource);
    collectedSources.push(...newSources);
  }

  return collectedSources;
}
/**
 * Downloads a custom behavior script from a URL and collects it as a local
 * file.
 *
 * The response body is written to /app/behaviors/ under a random hex file
 * name ending in ".js", and that path is then passed to
 * collectLocalPathBehaviors. Any failure (network error, non-success HTTP
 * status, write error) is logged and results in an empty list rather than a
 * thrown error.
 *
 * @param url - URL of the behavior script to download.
 * @returns The sources collected from the downloaded file, or an empty list
 *   if the download failed.
 */
async function collectOnlineBehavior(url: string): Promise<FileSources> {
  const filename = crypto.randomBytes(4).toString("hex") + ".js";
  const behaviorFilepath = `/app/behaviors/${filename}`;

  try {
    const res = await fetch(url);
    // Without this check the body of a 404/500 error page would be saved
    // and later injected into pages as if it were a behavior script.
    if (!res.ok) {
      throw new Error(
        `Unexpected HTTP status ${res.status} while fetching custom behavior`,
      );
    }
    const fileContents = await res.text();
    await fsp.writeFile(behaviorFilepath, fileContents);
    logger.info(
      "Custom behavior file downloaded",
      { url, path: behaviorFilepath },
      "behavior",
    );
    return await collectLocalPathBehaviors(behaviorFilepath);
  } catch (e) {
    logger.error(
      "Error downloading custom behavior from URL",
      { url, error: e },
      "behavior",
    );
  }
  return [];
}
async function collectLocalPathBehaviors(
fileOrDir: string,
ext?: string,
depth = 0,
): { path: string; contents: string }[] {
): Promise<FileSources> {
const resolvedPath = path.resolve(fileOrDir);
if (depth >= MAX_DEPTH) {
console.warn(
`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
logger.warn(
`Max depth of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
{},
"behavior",
);
return [];
}
const stat = fs.statSync(resolvedPath);
const stat = await fsp.stat(resolvedPath);
if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
const contents = fs.readFileSync(resolvedPath);
if (stat.isFile() && ALLOWED_EXTS.includes(path.extname(resolvedPath))) {
const contents = await fsp.readFile(resolvedPath);
return [
{
path: resolvedPath,
@ -30,24 +86,28 @@ export function collectAllFileSources(
];
}
if (stat.isDirectory()) {
const files = fs.readdirSync(resolvedPath);
return files.reduce(
(acc: { path: string; contents: string }[], next: string) => {
const nextPath = path.join(fileOrDir, next);
return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)];
},
[],
const behaviors: FileSources = [];
const isDir = stat.isDirectory();
if (!isDir && depth === 0) {
logger.warn(
"The provided path is not a .js file or directory",
{ path: resolvedPath },
"behavior",
);
}
if (depth === 0) {
console.warn(
`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`,
);
if (isDir) {
const files = await fsp.readdir(resolvedPath);
for (const file of files) {
const filePath = path.join(resolvedPath, file);
const newBehaviors = await collectLocalPathBehaviors(filePath, depth + 1);
behaviors.push(...newBehaviors);
}
}
return [];
return behaviors;
}
export async function getInfoString() {

View file

@ -1,16 +1,16 @@
import child_process from "child_process";
test("test custom behaviors", async () => {
test("test custom behaviors from local filepath", async () => {
const res = child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
);
const log = res.toString();
// custom behavior ran for example.com
// custom behavior ran for specs.webrecorder.net
expect(
log.indexOf(
'{"state":{},"msg":"test-stat","page":"https://example.com/","workerid":0}}',
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
) > 0,
).toBe(true);
@ -35,6 +35,42 @@ test("test custom behaviors", async () => {
).toBe(true);
});
// Runs a one-page crawl in docker with a single custom behavior given as a
// remote URL (custom-2.js), then checks the crawl log for evidence that the
// behavior was downloaded and that it ran on the crawled page.
test("test custom behavior from URL", async () => {
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --scopeType page");
const log = res.toString();
// message logged by the crawler once the downloaded behavior file is saved
expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);
// the URL-loaded behavior reported "test-stat-2" for old.webrecorder.net
expect(
log.indexOf(
'{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
) > 0,
).toBe(true);
});
// Runs a two-page crawl in docker with two --customBehaviors values, one a
// remote URL (custom-2.js) and one a local file (/custom-behaviors/custom.js),
// then checks the crawl log for output from both behaviors.
test("test mixed custom behavior sources", async () => {
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page");
const log = res.toString();
// the behavior given as a URL was downloaded
expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);
// "test-stat" for specs.webrecorder.net: presumably emitted by the local
// custom.js behavior (the only other source passed here) - verify against
// tests/custom-behaviors/custom.js
expect(
log.indexOf(
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
) > 0,
).toBe(true);
// "test-stat-2" for old.webrecorder.net: the same message the URL-only test
// expects from custom-2.js, i.e. the behavior loaded from the URL
expect(
log.indexOf(
'{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
) > 0,
).toBe(true);
});
test("test invalid behavior exit", async () => {
let status = 0;

View file

@ -11,7 +11,7 @@ class TestBehavior {
}
static isMatch() {
return window.location.origin === "https://example.com";
return window.location.origin === "https://specs.webrecorder.net";
}
async *run(ctx) {