mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Support loading custom behaviors from URLs and/or filepaths (#707)
Fixes #368. The `--customBehaviors` flag is now an array, making it repeatable. This should be backwards compatible when the flag is passed on the CLI, but YAML configs that use custom behaviors may need to be updated. Custom behaviors can be loaded from URLs, local file paths, and paths to local directories, in any combination. New tests are added to ensure that loading behaviors from a URL, and from a mix of URL and file path sources, works as expected. --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
e5bab8e7c8
commit
2a9b152531
8 changed files with 142 additions and 40 deletions
|
@ -50,6 +50,8 @@ RUN ln -s /app/dist/main.js /usr/bin/crawl; \
|
||||||
ln -s /app/dist/main.js /usr/bin/qa; \
|
ln -s /app/dist/main.js /usr/bin/qa; \
|
||||||
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
|
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
|
||||||
|
|
||||||
|
RUN mkdir -p /app/behaviors
|
||||||
|
|
||||||
WORKDIR /crawls
|
WORKDIR /crawls
|
||||||
|
|
||||||
# enable to test custom behaviors build (from browsertrix-behaviors)
|
# enable to test custom behaviors build (from browsertrix-behaviors)
|
||||||
|
|
|
@ -246,9 +246,11 @@ Options:
|
||||||
ailOnFailedSeed may result in crawl
|
ailOnFailedSeed may result in crawl
|
||||||
failing due to non-200 responses
|
failing due to non-200 responses
|
||||||
[boolean] [default: false]
|
[boolean] [default: false]
|
||||||
--customBehaviors injects a custom behavior file or se
|
--customBehaviors Custom behavior files to inject. Val
|
||||||
t of behavior files in a directory
|
ues can be URLs, paths to individual
|
||||||
[string]
|
behavior files, or paths to a direct
|
||||||
|
ory of behavior files.
|
||||||
|
[array] [default: []]
|
||||||
--debugAccessRedis if set, runs internal redis without
|
--debugAccessRedis if set, runs internal redis without
|
||||||
protected mode to allow external acc
|
protected mode to allow external acc
|
||||||
ess (for debugging) [boolean]
|
ess (for debugging) [boolean]
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "browsertrix-crawler",
|
"name": "browsertrix-crawler",
|
||||||
"version": "1.3.4",
|
"version": "1.4.0-beta.0",
|
||||||
"main": "browsertrix-crawler",
|
"main": "browsertrix-crawler",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
|
|
|
@ -39,7 +39,7 @@ import {
|
||||||
runWorkers,
|
runWorkers,
|
||||||
} from "./util/worker.js";
|
} from "./util/worker.js";
|
||||||
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
|
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
|
||||||
import { collectAllFileSources, getInfoString } from "./util/file_reader.js";
|
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
|
||||||
|
|
||||||
import { Browser } from "./util/browser.js";
|
import { Browser } from "./util/browser.js";
|
||||||
|
|
||||||
|
@ -511,7 +511,7 @@ export class Crawler {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.params.customBehaviors) {
|
if (this.params.customBehaviors) {
|
||||||
this.customBehaviors = this.loadCustomBehaviors(
|
this.customBehaviors = await this.loadCustomBehaviors(
|
||||||
this.params.customBehaviors,
|
this.params.customBehaviors,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -801,10 +801,10 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
loadCustomBehaviors(filename: string) {
|
async loadCustomBehaviors(sources: string[]) {
|
||||||
let str = "";
|
let str = "";
|
||||||
|
|
||||||
for (const { contents } of collectAllFileSources(filename, ".js")) {
|
for (const { contents } of await collectCustomBehaviors(sources)) {
|
||||||
str += `self.__bx_behaviors.load(${contents});\n`;
|
str += `self.__bx_behaviors.load(${contents});\n`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -812,13 +812,13 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
}
|
}
|
||||||
|
|
||||||
async checkBehaviorScripts(cdp: CDPSession) {
|
async checkBehaviorScripts(cdp: CDPSession) {
|
||||||
const filename = this.params.customBehaviors;
|
const sources = this.params.customBehaviors;
|
||||||
|
|
||||||
if (!filename) {
|
if (!sources) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const { path, contents } of collectAllFileSources(filename, ".js")) {
|
for (const { path, contents } of await collectCustomBehaviors(sources)) {
|
||||||
await this.browser.checkScript(cdp, path, contents);
|
await this.browser.checkScript(cdp, path, contents);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -551,8 +551,10 @@ class ArgParser {
|
||||||
|
|
||||||
customBehaviors: {
|
customBehaviors: {
|
||||||
describe:
|
describe:
|
||||||
"injects a custom behavior file or set of behavior files in a directory",
|
"Custom behavior files to inject. Values can be URLs, paths to individual behavior files, or paths" +
|
||||||
type: "string",
|
" to a directory of behavior files",
|
||||||
|
type: "array",
|
||||||
|
default: [],
|
||||||
},
|
},
|
||||||
|
|
||||||
debugAccessRedis: {
|
debugAccessRedis: {
|
||||||
|
|
|
@ -1,27 +1,83 @@
|
||||||
import fs from "fs";
|
|
||||||
import fsp from "fs/promises";
|
import fsp from "fs/promises";
|
||||||
import path from "path";
|
import path from "path";
|
||||||
|
import crypto from "crypto";
|
||||||
|
import { fetch } from "undici";
|
||||||
|
|
||||||
|
import { logger } from "./logger.js";
|
||||||
|
|
||||||
const MAX_DEPTH = 2;
|
const MAX_DEPTH = 2;
|
||||||
|
|
||||||
export function collectAllFileSources(
|
// Add .ts to allowed extensions when we can support it
|
||||||
|
const ALLOWED_EXTS = [".js"];
|
||||||
|
|
||||||
|
export type FileSource = {
|
||||||
|
path: string;
|
||||||
|
contents: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
export type FileSources = FileSource[];
|
||||||
|
|
||||||
|
export async function collectCustomBehaviors(
|
||||||
|
sources: string[],
|
||||||
|
): Promise<FileSources> {
|
||||||
|
const collectedSources: FileSources = [];
|
||||||
|
|
||||||
|
for (const fileSource of sources) {
|
||||||
|
if (fileSource.startsWith("http")) {
|
||||||
|
const newSources = await collectOnlineBehavior(fileSource);
|
||||||
|
collectedSources.push(...newSources);
|
||||||
|
} else {
|
||||||
|
const newSources = await collectLocalPathBehaviors(fileSource);
|
||||||
|
collectedSources.push(...newSources);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return collectedSources;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function collectOnlineBehavior(url: string): Promise<FileSources> {
|
||||||
|
const filename = crypto.randomBytes(4).toString("hex") + ".js";
|
||||||
|
const behaviorFilepath = `/app/behaviors/${filename}`;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const res = await fetch(url);
|
||||||
|
const fileContents = await res.text();
|
||||||
|
await fsp.writeFile(behaviorFilepath, fileContents);
|
||||||
|
logger.info(
|
||||||
|
"Custom behavior file downloaded",
|
||||||
|
{ url, path: behaviorFilepath },
|
||||||
|
"behavior",
|
||||||
|
);
|
||||||
|
return await collectLocalPathBehaviors(behaviorFilepath);
|
||||||
|
} catch (e) {
|
||||||
|
logger.error(
|
||||||
|
"Error downloading custom behavior from URL",
|
||||||
|
{ url, error: e },
|
||||||
|
"behavior",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
async function collectLocalPathBehaviors(
|
||||||
fileOrDir: string,
|
fileOrDir: string,
|
||||||
ext?: string,
|
|
||||||
depth = 0,
|
depth = 0,
|
||||||
): { path: string; contents: string }[] {
|
): Promise<FileSources> {
|
||||||
const resolvedPath = path.resolve(fileOrDir);
|
const resolvedPath = path.resolve(fileOrDir);
|
||||||
|
|
||||||
if (depth >= MAX_DEPTH) {
|
if (depth >= MAX_DEPTH) {
|
||||||
console.warn(
|
logger.warn(
|
||||||
`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
|
`Max depth of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
|
||||||
|
{},
|
||||||
|
"behavior",
|
||||||
);
|
);
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
const stat = fs.statSync(resolvedPath);
|
const stat = await fsp.stat(resolvedPath);
|
||||||
|
|
||||||
if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
|
if (stat.isFile() && ALLOWED_EXTS.includes(path.extname(resolvedPath))) {
|
||||||
const contents = fs.readFileSync(resolvedPath);
|
const contents = await fsp.readFile(resolvedPath);
|
||||||
return [
|
return [
|
||||||
{
|
{
|
||||||
path: resolvedPath,
|
path: resolvedPath,
|
||||||
|
@ -30,24 +86,28 @@ export function collectAllFileSources(
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (stat.isDirectory()) {
|
const behaviors: FileSources = [];
|
||||||
const files = fs.readdirSync(resolvedPath);
|
|
||||||
return files.reduce(
|
const isDir = stat.isDirectory();
|
||||||
(acc: { path: string; contents: string }[], next: string) => {
|
|
||||||
const nextPath = path.join(fileOrDir, next);
|
if (!isDir && depth === 0) {
|
||||||
return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)];
|
logger.warn(
|
||||||
},
|
"The provided path is not a .js file or directory",
|
||||||
[],
|
{ path: resolvedPath },
|
||||||
|
"behavior",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (depth === 0) {
|
if (isDir) {
|
||||||
console.warn(
|
const files = await fsp.readdir(resolvedPath);
|
||||||
`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`,
|
for (const file of files) {
|
||||||
);
|
const filePath = path.join(resolvedPath, file);
|
||||||
|
const newBehaviors = await collectLocalPathBehaviors(filePath, depth + 1);
|
||||||
|
behaviors.push(...newBehaviors);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return [];
|
return behaviors;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getInfoString() {
|
export async function getInfoString() {
|
||||||
|
|
|
@ -1,16 +1,16 @@
|
||||||
import child_process from "child_process";
|
import child_process from "child_process";
|
||||||
|
|
||||||
test("test custom behaviors", async () => {
|
test("test custom behaviors from local filepath", async () => {
|
||||||
const res = child_process.execSync(
|
const res = child_process.execSync(
|
||||||
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
|
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
|
||||||
);
|
);
|
||||||
|
|
||||||
const log = res.toString();
|
const log = res.toString();
|
||||||
|
|
||||||
// custom behavior ran for example.com
|
// custom behavior ran for specs.webrecorder.net
|
||||||
expect(
|
expect(
|
||||||
log.indexOf(
|
log.indexOf(
|
||||||
'{"state":{},"msg":"test-stat","page":"https://example.com/","workerid":0}}',
|
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
|
||||||
) > 0,
|
) > 0,
|
||||||
).toBe(true);
|
).toBe(true);
|
||||||
|
|
||||||
|
@ -35,6 +35,42 @@ test("test custom behaviors", async () => {
|
||||||
).toBe(true);
|
).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("test custom behavior from URL", async () => {
|
||||||
|
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --scopeType page");
|
||||||
|
|
||||||
|
const log = res.toString();
|
||||||
|
|
||||||
|
expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);
|
||||||
|
|
||||||
|
expect(
|
||||||
|
log.indexOf(
|
||||||
|
'{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
|
||||||
|
) > 0,
|
||||||
|
).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("test mixed custom behavior sources", async () => {
|
||||||
|
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page");
|
||||||
|
|
||||||
|
const log = res.toString();
|
||||||
|
|
||||||
|
// test custom behavior from url ran
|
||||||
|
expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);
|
||||||
|
|
||||||
|
expect(
|
||||||
|
log.indexOf(
|
||||||
|
'{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
|
||||||
|
) > 0,
|
||||||
|
).toBe(true);
|
||||||
|
|
||||||
|
// test custom behavior from local file ran
|
||||||
|
expect(
|
||||||
|
log.indexOf(
|
||||||
|
'{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
|
||||||
|
) > 0,
|
||||||
|
).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
test("test invalid behavior exit", async () => {
|
test("test invalid behavior exit", async () => {
|
||||||
let status = 0;
|
let status = 0;
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ class TestBehavior {
|
||||||
}
|
}
|
||||||
|
|
||||||
static isMatch() {
|
static isMatch() {
|
||||||
return window.location.origin === "https://example.com";
|
return window.location.origin === "https://specs.webrecorder.net";
|
||||||
}
|
}
|
||||||
|
|
||||||
async *run(ctx) {
|
async *run(ctx) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue