mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Support loading custom behaviors from git repo (#717)
Fixes #712 - Also expands the existing documentation about behaviors and adds a test. - Uses query arg for 'branch' and 'path' to specify git branch and subpath in repo, respectively. --------- Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
This commit is contained in:
parent
ea05307528
commit
60c84b342e
4 changed files with 121 additions and 10 deletions
|
@ -14,12 +14,41 @@ To disable behaviors for a crawl, use `--behaviors ""`.
|
|||
|
||||
## Additional Custom Behaviors
|
||||
|
||||
Custom behaviors can be mounted into the crawler and loaded from there. For example:
|
||||
|
||||
```sh
|
||||
docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --customBehaviors /custom-behaviors/
|
||||
```
|
||||
|
||||
This will load all the custom behaviors stored in the `tests/custom-behaviors` directory. The first behavior which returns true for `isMatch()` will be run on a given page.
|
||||
Custom behaviors can be mounted into the crawler and run from there, or downloaded from a URL.
|
||||
|
||||
Each behavior should contain a single class that implements the behavior interface. See [the behaviors tutorial](https://github.com/webrecorder/browsertrix-behaviors/blob/main/docs/TUTORIAL.md) for more info on how to write behaviors.
|
||||
|
||||
The first behavior which returns true for `isMatch()` will be run on a given page.
|
||||
|
||||
The repeatable `--customBehaviors` flag can accept:
|
||||
|
||||
- A path to a directory of behavior files
|
||||
- A path to a single behavior file
|
||||
- A URL for a single behavior file to download
|
||||
- A URL for a git repository of the form `git+https://git.example.com/repo.git`, with optional query parameters `branch` (to specify a particular branch to use) and `path` (to specify a relative path to a directory within the git repository where the custom behaviors are located)
|
||||
|
||||
### Examples
|
||||
|
||||
#### Local filepath (directory)
|
||||
|
||||
```sh
|
||||
docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --customBehaviors /custom-behaviors/
|
||||
```
|
||||
|
||||
#### Local filepath (file)
|
||||
|
||||
```sh
|
||||
docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --customBehaviors /custom-behaviors/custom.js
|
||||
```
|
||||
|
||||
#### URL
|
||||
|
||||
```sh
|
||||
docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --customBehaviors https://example.com/custom-behavior-1 --customBehaviors https://example.org/custom-behavior-2
|
||||
```
|
||||
|
||||
#### Git repository
|
||||
|
||||
```sh
|
||||
docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --customBehaviors "git+https://git.example.com/custom-behaviors?branch=dev&path=path/to/behaviors"
|
||||
```
|
||||
|
|
|
@ -564,8 +564,10 @@ class ArgParser {
|
|||
|
||||
customBehaviors: {
|
||||
describe:
|
||||
"Custom behavior files to inject. Values can be URLs, paths to individual behavior files, or paths" +
|
||||
" to a directory of behavior files",
|
||||
"Custom behavior files to inject. Valid values: URL to file, path to file, path to directory" +
|
||||
" of behaviors, URL to Git repo of behaviors (prefixed with git+, optionally specify branch and" +
|
||||
" relative path to a directory within repo as branch and path query parameters, e.g." +
|
||||
' --customBehaviors "git+https://git.example.com/repo.git?branch=dev&path=some/dir"',
|
||||
type: "array",
|
||||
default: [],
|
||||
},
|
||||
|
|
|
@ -2,9 +2,13 @@ import fsp from "fs/promises";
|
|||
import path from "path";
|
||||
import crypto from "crypto";
|
||||
import { fetch } from "undici";
|
||||
import util from "util";
|
||||
import { exec as execCallback, execFile as execFileCallback } from "child_process";
|
||||
|
||||
import { logger } from "./logger.js";
|
||||
|
||||
const exec = util.promisify(execCallback);
|
||||
|
||||
const MAX_DEPTH = 2;
|
||||
|
||||
// Add .ts to allowed extensions when we can support it
|
||||
|
@ -23,7 +27,10 @@ export async function collectCustomBehaviors(
|
|||
const collectedSources: FileSources = [];
|
||||
|
||||
for (const fileSource of sources) {
|
||||
if (fileSource.startsWith("http")) {
|
||||
if (fileSource.startsWith("git+")) {
|
||||
const newSources = await collectGitBehaviors(fileSource);
|
||||
collectedSources.push(...newSources);
|
||||
} else if (fileSource.startsWith("http")) {
|
||||
const newSources = await collectOnlineBehavior(fileSource);
|
||||
collectedSources.push(...newSources);
|
||||
} else {
|
||||
|
@ -35,6 +42,44 @@ export async function collectCustomBehaviors(
|
|||
return collectedSources;
|
||||
}
|
||||
|
||||
// Promisified execFile: runs a binary with an argument array and no shell, so
// values taken from the user-supplied URL can never be interpreted as shell syntax.
const execFile = util.promisify(execFileCallback);

/**
 * Clones a git repository of custom behaviors and collects the behavior files
 * found in it.
 *
 * @param gitUrl - Repository URL prefixed with `git+`, e.g.
 *   `git+https://git.example.com/repo.git?branch=dev&path=some/dir`. The optional
 *   `branch` query parameter selects the branch to clone; the optional `path`
 *   query parameter selects a directory inside the repository to collect from.
 * @returns The sources collected from the cloned repository, or an empty array
 *   if the URL is invalid, `path` points outside the repository, or the clone
 *   or collection fails (the failure is logged, not thrown).
 */
async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
  const GIT_PREFIX = "git+";

  // Strip only the leading prefix; a later "git+" (e.g. inside the query
  // string) must not truncate the URL.
  const url = gitUrl.startsWith(GIT_PREFIX)
    ? gitUrl.slice(GIT_PREFIX.length)
    : gitUrl;

  // The query string only carries options for the crawler; git gets the URL without it.
  const queryStart = url.indexOf("?");
  const urlStripped = queryStart === -1 ? url : url.slice(0, queryStart);

  try {
    // Parsed inside the try block so a malformed URL is reported through the
    // same logged-error path as a failed clone instead of throwing to the caller.
    const params = new URL(url).searchParams;
    const branch = params.get("branch") || "";
    const relPath = params.get("path") || "";

    const tmpDir = `/tmp/behaviors-repo-${crypto.randomBytes(4).toString("hex")}`;

    // path.join normalizes ".." segments, so a simple prefix check is enough
    // to ensure the requested directory stays inside the clone.
    const pathToCollect = path.join(tmpDir, relPath);
    if (
      pathToCollect !== tmpDir &&
      !pathToCollect.startsWith(tmpDir + path.sep)
    ) {
      throw new Error(
        `Path "${relPath}" resolves outside of the cloned repository`,
      );
    }

    const cloneArgs = ["clone"];
    if (branch) {
      cloneArgs.push("-b", branch, "--single-branch");
    }
    // "--" ends option parsing so the repository URL is never read as a git option.
    cloneArgs.push("--", urlStripped, tmpDir);

    await execFile("git", cloneArgs);
    logger.info(
      "Custom behavior files downloaded from git repo",
      { url: urlStripped },
      "behavior",
    );
    return await collectLocalPathBehaviors(pathToCollect);
  } catch (e) {
    logger.error(
      "Error downloading custom behaviors from Git repo",
      { url: urlStripped, error: e },
      "behavior",
    );
  }
  return [];
}
|
||||
|
||||
async function collectOnlineBehavior(url: string): Promise<FileSources> {
|
||||
const filename = crypto.randomBytes(4).toString("hex") + ".js";
|
||||
const behaviorFilepath = `/app/behaviors/${filename}`;
|
||||
|
|
|
@ -71,6 +71,41 @@ test("test mixed custom behavior sources", async () => {
|
|||
).toBe(true);
|
||||
});
|
||||
|
||||
test("test custom behaviors from git repo", async () => {
  // Crawl three pages with the custom behaviors loaded from the `main` branch
  // of the crawler's own repository (subdirectory tests/custom-behaviors).
  const crawlCommand =
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors \"git+https://github.com/webrecorder/browsertrix-crawler.git?branch=main&path=tests/custom-behaviors\" --scopeType page";

  const output = child_process.execSync(crawlCommand).toString();

  // Each entry is a log fragment and whether it is expected in the crawl output.
  const expectations = [
    // custom behavior ran for specs.webrecorder.net
    {
      entry:
        '{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
      logged: true,
    },
    // but not for example.org
    {
      entry:
        '{"state":{},"msg":"test-stat","page":"https://example.org/","workerid":0}}',
      logged: false,
    },
    // example.org logs the autoscroll skip message instead
    {
      entry:
        '{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://example.org/","workerid":0}}',
      logged: true,
    },
    // another custom behavior ran for old.webrecorder.net
    {
      entry:
        '{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
      logged: true,
    },
  ];

  for (const { entry, logged } of expectations) {
    expect(output.indexOf(entry) > 0).toBe(logged);
  }
});
|
||||
|
||||
test("test invalid behavior exit", async () => {
|
||||
let status = 0;
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue