Fail crawl with fatal message if custom behavior isn't loaded (#799)

Fixes #797 

The crawler will now exit with a fatal log message and exit code 17 if:

- A Git repository specified with `--customBehaviors` cannot be cloned
successfully (new)
- A custom behavior file at a URL specified with `--customBehaviors` is
not fetched successfully (new)
- No custom behaviors are collected from a local filepath specified with
`--customBehaviors`, or an error is thrown while attempting to collect
files from a nonexistent path (new)
- Any custom behaviors collected fail `Browser.checkScript` validation
(existing behavior)

Tests have also been added accordingly.
This commit is contained in:
Tessa Walsh 2025-03-31 20:35:30 -04:00 committed by GitHub
parent e751929a7a
commit 5fedde6eee
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 90 additions and 26 deletions

View file

@ -72,7 +72,7 @@ async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
);
return await collectLocalPathBehaviors(pathToCollect);
} catch (e) {
logger.error(
logger.fatal(
"Error downloading custom behaviors from Git repo",
{ url: urlStripped, error: e },
"behavior",
@ -96,7 +96,7 @@ async function collectOnlineBehavior(url: string): Promise<FileSources> {
);
return await collectLocalPathBehaviors(behaviorFilepath);
} catch (e) {
logger.error(
logger.fatal(
"Error downloading custom behavior from URL",
{ url, error: e },
"behavior",
@ -120,37 +120,56 @@ async function collectLocalPathBehaviors(
return [];
}
const stat = await fsp.stat(resolvedPath);
if (stat.isFile() && ALLOWED_EXTS.includes(path.extname(resolvedPath))) {
const contents = await fsp.readFile(resolvedPath);
return [
{
path: resolvedPath,
contents: `/* src: ${resolvedPath} */\n\n${contents}`,
},
];
}
const behaviors: FileSources = [];
const isDir = stat.isDirectory();
try {
const stat = await fsp.stat(resolvedPath);
if (!isDir && depth === 0) {
logger.warn(
"The provided path is not a .js file or directory",
{ path: resolvedPath },
if (stat.isFile() && ALLOWED_EXTS.includes(path.extname(resolvedPath))) {
const contents = await fsp.readFile(resolvedPath);
return [
{
path: resolvedPath,
contents: `/* src: ${resolvedPath} */\n\n${contents}`,
},
];
}
const isDir = stat.isDirectory();
if (!isDir && depth === 0) {
logger.warn(
"The provided path is not a .js file or directory",
{ path: resolvedPath },
"behavior",
);
}
if (isDir) {
const files = await fsp.readdir(resolvedPath);
for (const file of files) {
const filePath = path.join(resolvedPath, file);
const newBehaviors = await collectLocalPathBehaviors(
filePath,
depth + 1,
);
behaviors.push(...newBehaviors);
}
}
} catch (e) {
logger.fatal(
"Error fetching local custom behaviors",
{ path: resolvedPath, error: e },
"behavior",
);
}
if (isDir) {
const files = await fsp.readdir(resolvedPath);
for (const file of files) {
const filePath = path.join(resolvedPath, file);
const newBehaviors = await collectLocalPathBehaviors(filePath, depth + 1);
behaviors.push(...newBehaviors);
}
// `behaviors` is an array, and an empty array is still truthy, so a
// `!behaviors` check can never fire. Test the length instead so that a
// top-level path yielding no behaviors is reported as fatal.
if (behaviors.length === 0 && depth === 0) {
  logger.fatal(
    "No custom behaviors found at specified path",
    { path: resolvedPath },
    "behavior",
  );
}
return behaviors;

View file

@ -120,3 +120,48 @@ test("test invalid behavior exit", async () => {
// logger fatal exit code
expect(status).toBe(17);
});
// Runs the crawler with a custom behavior URL that is presumably unreachable
// and checks that the process ends with the logger's fatal exit code.
test("test crawl exits if behavior not fetched from url", async () => {
  const command =
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors https://webrecorder.net/doesntexist/custombehavior.js --scopeType page";
  let exitCode = 0;

  try {
    child_process.execSync(command);
  } catch (err) {
    // execSync throws when the child exits non-zero; keep its exit status
    exitCode = err.status;
  }

  // logger fatal exit code
  expect(exitCode).toBe(17);
});
// Runs the crawler with a git+https custom behavior source that is presumably
// not cloneable and checks that the process ends with the logger's fatal
// exit code.
test("test crawl exits if behavior not fetched from git repo", async () => {
  const command =
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors git+https://github.com/webrecorder/doesntexist --scopeType page";
  let exitCode = 0;

  try {
    child_process.execSync(command);
  } catch (err) {
    // execSync throws when the child exits non-zero; keep its exit status
    exitCode = err.status;
  }

  // logger fatal exit code
  expect(exitCode).toBe(17);
});
// Runs the crawler with a local custom behavior path that is not mounted into
// the container (only test-crawls is mounted at /crawls) and checks that the
// process ends with the logger's fatal exit code.
test("test crawl exits if not custom behaviors collected from local path", async () => {
  const command =
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors /custom-behaviors/doesntexist --scopeType page";
  let exitCode = 0;

  try {
    child_process.execSync(command);
  } catch (err) {
    // execSync throws when the child exits non-zero; keep its exit status
    exitCode = err.status;
  }

  // logger fatal exit code
  expect(exitCode).toBe(17);
});