Support loading custom behaviors from git repo (#717)

Fixes #712 
- Also expands the existing documentation about behaviors and adds a test.
- Uses the `branch` and `path` query parameters to specify the git branch and the subpath within the repo, respectively.

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
This commit is contained in:
Tessa Walsh 2024-11-14 01:50:33 -05:00 committed by GitHub
parent ea05307528
commit 60c84b342e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 121 additions and 10 deletions

View file

@ -14,12 +14,41 @@ To disable behaviors for a crawl, use `--behaviors ""`.
## Additional Custom Behaviors
Custom behaviors can be mounted into the crawler and loaded from there. For example:
```sh
docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --customBehaviors /custom-behaviors/
```
This will load all the custom behaviors stored in the `tests/custom-behaviors` directory. The first behavior which returns true for `isMatch()` will be run on a given page.
Custom behaviors can be mounted into the crawler and run from there, or downloaded from a URL.
Each behavior should contain a single class that implements the behavior interface. See [the behaviors tutorial](https://github.com/webrecorder/browsertrix-behaviors/blob/main/docs/TUTORIAL.md) for more info on how to write behaviors.
The first behavior which returns true for `isMatch()` will be run on a given page.
The repeatable `--customBehaviors` flag can accept:
- A path to a directory of behavior files
- A path to a single behavior file
- A URL for a single behavior file to download
- A URL for a git repository of the form `git+https://git.example.com/repo.git`, with optional query parameters `branch` (to specify a particular branch to use) and `path` (to specify a relative path to a directory within the git repository where the custom behaviors are located)
### Examples
#### Local filepath (directory)
```sh
docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --customBehaviors /custom-behaviors/
```
#### Local filepath (file)
```sh
docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --customBehaviors /custom-behaviors/custom.js
```
#### URL
```sh
docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --customBehaviors https://example.com/custom-behavior-1 --customBehaviors https://example.org/custom-behavior-2
```
#### Git repository
```sh
docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --customBehaviors "git+https://git.example.com/custom-behaviors?branch=dev&path=path/to/behaviors"
```

View file

@ -564,8 +564,10 @@ class ArgParser {
customBehaviors: {
describe:
"Custom behavior files to inject. Values can be URLs, paths to individual behavior files, or paths" +
" to a directory of behavior files",
"Custom behavior files to inject. Valid values: URL to file, path to file, path to directory" +
" of behaviors, URL to Git repo of behaviors (prefixed with git+, optionally specify branch and" +
" relative path to a directory within repo as branch and path query parameters, e.g." +
' --customBehaviors "git+https://git.example.com/repo.git?branch=dev&path=some/dir"',
type: "array",
default: [],
},

View file

@ -2,9 +2,13 @@ import fsp from "fs/promises";
import path from "path";
import crypto from "crypto";
import { fetch } from "undici";
import util from "util";
import {
  exec as execCallback,
  execFile as execFileCallback,
} from "child_process";
import { logger } from "./logger.js";
const exec = util.promisify(execCallback);
const MAX_DEPTH = 2;
// Add .ts to allowed extensions when we can support it
@ -23,7 +27,10 @@ export async function collectCustomBehaviors(
const collectedSources: FileSources = [];
for (const fileSource of sources) {
if (fileSource.startsWith("http")) {
if (fileSource.startsWith("git+")) {
const newSources = await collectGitBehaviors(fileSource);
collectedSources.push(...newSources);
} else if (fileSource.startsWith("http")) {
const newSources = await collectOnlineBehavior(fileSource);
collectedSources.push(...newSources);
} else {
@ -35,6 +42,44 @@ export async function collectCustomBehaviors(
return collectedSources;
}
/**
 * Clone a git repository of custom behaviors and collect the behavior files
 * found inside it.
 *
 * @param gitUrl - Repository URL prefixed with `git+`. Two optional query
 *   parameters are recognised: `branch` (branch to clone) and `path`
 *   (directory inside the repository to collect behaviors from).
 * @returns The sources found by `collectLocalPathBehaviors` in the cloned
 *   directory, or an empty list if the URL is invalid, the requested path
 *   points outside the clone, or the clone itself fails (the failure is
 *   logged, not thrown).
 */
async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
  // Strip only the leading "git+" marker; splitting on every "git+" would
  // mangle URLs that contain that text again, e.g. inside a query value.
  const url = gitUrl.startsWith("git+") ? gitUrl.slice("git+".length) : gitUrl;
  const urlStripped = url.split("?")[0];

  const tmpDir = `/tmp/behaviors-repo-${crypto.randomBytes(4).toString("hex")}`;

  // Promisified here so the block does not rely on any other module-level
  // binding than the imported callback-style function.
  const execFile = util.promisify(execFileCallback);

  try {
    // Parsed inside the try block so a malformed URL is reported through the
    // same logged-error path as a failed clone.
    const params = new URL(url).searchParams;
    const branch = params.get("branch") || "";
    const relPath = params.get("path") || "";

    // path.join normalizes ".." segments, so a containment check on the
    // result is enough to reject paths that escape the cloned repository.
    const pathToCollect = path.join(tmpDir, relPath);
    const insideClone =
      pathToCollect === tmpDir || pathToCollect.startsWith(tmpDir + path.sep);
    if (!insideClone) {
      throw new Error(`Path "${relPath}" is outside of the cloned repository`);
    }

    // Arguments are passed as an array without a shell, so the branch name
    // and URL cannot be interpreted as shell syntax. The "--" keeps a URL
    // starting with "-" from being read as a git option.
    const cloneArgs = ["clone"];
    if (branch) {
      cloneArgs.push("-b", branch, "--single-branch");
    }
    cloneArgs.push("--", urlStripped, tmpDir);

    await execFile("git", cloneArgs);
    logger.info(
      "Custom behavior files downloaded from git repo",
      { url: urlStripped },
      "behavior",
    );
    return await collectLocalPathBehaviors(pathToCollect);
  } catch (e) {
    logger.error(
      "Error downloading custom behaviors from Git repo",
      { url: urlStripped, error: e },
      "behavior",
    );
  }
  return [];
}
async function collectOnlineBehavior(url: string): Promise<FileSources> {
const filename = crypto.randomBytes(4).toString("hex") + ".js";
const behaviorFilepath = `/app/behaviors/${filename}`;

View file

@ -71,6 +71,41 @@ test("test mixed custom behavior sources", async () => {
).toBe(true);
});
test("test custom behaviors from git repo", async () => {
  // Crawl three pages with custom behaviors loaded from a git repository
  // (branch and sub-directory are selected through the query parameters).
  const output = child_process
    .execSync(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors \"git+https://github.com/webrecorder/browsertrix-crawler.git?branch=main&path=tests/custom-behaviors\" --scopeType page",
    )
    .toString();

  // True when the given log fragment occurs in the crawler output at any
  // position after the very first character.
  const wasLogged = (fragment: string): boolean => output.indexOf(fragment) > 0;

  // the first custom behavior ran on specs.webrecorder.net
  const specsStat =
    '{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}';
  expect(wasLogged(specsStat)).toBe(true);

  // it did not run on example.org ...
  const exampleStat =
    '{"state":{},"msg":"test-stat","page":"https://example.org/","workerid":0}}';
  expect(wasLogged(exampleStat)).toBe(false);

  // ... where the autoscroll skip message was logged instead
  const exampleAutoscroll =
    '{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://example.org/","workerid":0}}';
  expect(wasLogged(exampleAutoscroll)).toBe(true);

  // the second custom behavior ran on old.webrecorder.net
  const oldSiteStat =
    '{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}';
  expect(wasLogged(oldSiteStat)).toBe(true);
});
test("test invalid behavior exit", async () => {
let status = 0;