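/**
 * Helpers for collecting custom behavior files from git repositories,
 * remote URLs, and local paths, plus a version info string for the crawler.
 */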
import fsp from "fs/promises";
import path from "path";
import crypto from "crypto";

import { fetch } from "undici";

import util from "util";
import { exec as execCallback } from "child_process";

import { logger } from "./logger.js";
import { getProxyDispatcher } from "./proxy.js";

const exec = util.promisify(execCallback);

const MAX_DEPTH = 2;

// Add .ts to allowed extensions when we can support it
const ALLOWED_EXTS = [".js"];
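
/**
 * A collected custom behavior file: the resolved path it was read from
 * and its full contents as a string.
 */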
export type FileSource = {
  path: string;
  contents: string;
};

export type FileSources = FileSource[];
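
/**
 * Collect custom behavior files from a list of sources. Each source may be
 * a git repository ("git+" prefix), an HTTP(S) URL, or a local file or
 * directory path; results from all sources are concatenated.
 *
 * Hypothetical example values, for illustration only:
 *
 *   await collectCustomBehaviors([
 *     "git+https://example.com/org/behaviors.git?branch=main&path=dist",
 *     "https://example.com/custom-behavior.js",
 *     "/custom-behaviors/",
 *   ]);
 */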
export async function collectCustomBehaviors(
  sources: string[],
): Promise<FileSources> {
  const collectedSources: FileSources = [];

  for (const fileSource of sources) {
    if (fileSource.startsWith("git+")) {
      const newSources = await collectGitBehaviors(fileSource);
      collectedSources.push(...newSources);
    } else if (fileSource.startsWith("http")) {
      const newSources = await collectOnlineBehavior(fileSource);
      collectedSources.push(...newSources);
    } else {
      const newSources = await collectLocalPathBehaviors(fileSource);
      collectedSources.push(...newSources);
    }
  }

  return collectedSources;
}
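
/**
 * Clone a git repository and collect behavior files from it. The source has
 * the form "git+<repo-url>"; optional "branch" and "path" query parameters
 * select a branch to clone and a subdirectory to scan. The repo is cloned
 * into a temporary directory under /tmp and then collected as a local path.
 */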
async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
  const url = gitUrl.split("git+").pop() || "";
  const params = new URL(url).searchParams;
  const branch = params.get("branch") || "";
  const relPath = params.get("path") || "";
  const urlStripped = url.split("?")[0];

  const tmpDir = `/tmp/behaviors-repo-${crypto.randomBytes(4).toString("hex")}`;

  let cloneCommand = "git clone ";
  if (branch) {
    cloneCommand += `-b ${branch} --single-branch `;
  }
  cloneCommand += `${urlStripped} ${tmpDir}`;

  let pathToCollect = tmpDir;
  if (relPath) {
    pathToCollect = path.join(tmpDir, relPath);
  }

  try {
    await exec(cloneCommand);
    logger.info(
      "Custom behavior files downloaded from git repo",
      { url: urlStripped },
      "behavior",
    );
    return await collectLocalPathBehaviors(pathToCollect);
  } catch (e) {
    logger.error(
      "Error downloading custom behaviors from Git repo",
      { url: urlStripped, error: e },
      "behavior",
    );
  }
  return [];
}
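
/**
 * Download a behavior file from a URL (through the configured proxy
 * dispatcher, if any), save it under /app/behaviors/ with a random
 * filename, and collect it from disk.
 */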
async function collectOnlineBehavior(url: string): Promise<FileSources> {
  const filename = crypto.randomBytes(4).toString("hex") + ".js";
  const behaviorFilepath = `/app/behaviors/${filename}`;

  try {
    const res = await fetch(url, { dispatcher: getProxyDispatcher() });
    const fileContents = await res.text();
    await fsp.writeFile(behaviorFilepath, fileContents);
    logger.info(
      "Custom behavior file downloaded",
      { url, path: behaviorFilepath },
      "behavior",
    );
    return await collectLocalPathBehaviors(behaviorFilepath);
  } catch (e) {
    logger.error(
      "Error downloading custom behavior from URL",
      { url, error: e },
      "behavior",
    );
  }
  return [];
}
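
/**
 * Collect behavior files from a local file or directory. Directories are
 * traversed recursively up to MAX_DEPTH; only files with an allowed
 * extension (ALLOWED_EXTS) are collected, and each file's contents are
 * prefixed with a comment recording its source path.
 */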
async function collectLocalPathBehaviors(
  fileOrDir: string,
  depth = 0,
): Promise<FileSources> {
  const resolvedPath = path.resolve(fileOrDir);

  if (depth >= MAX_DEPTH) {
    logger.warn(
      `Max depth of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
      {},
      "behavior",
    );
    return [];
  }

  const stat = await fsp.stat(resolvedPath);

  if (stat.isFile() && ALLOWED_EXTS.includes(path.extname(resolvedPath))) {
    const contents = await fsp.readFile(resolvedPath);
    return [
      {
        path: resolvedPath,
        contents: `/* src: ${resolvedPath} */\n\n${contents}`,
      },
    ];
  }

  const behaviors: FileSources = [];

  const isDir = stat.isDirectory();

  if (!isDir && depth === 0) {
    logger.warn(
      "The provided path is not a .js file or directory",
      { path: resolvedPath },
      "behavior",
    );
  }

  if (isDir) {
    const files = await fsp.readdir(resolvedPath);
    for (const file of files) {
      const filePath = path.join(resolvedPath, file);
      const newBehaviors = await collectLocalPathBehaviors(filePath, depth + 1);
      behaviors.push(...newBehaviors);
    }
  }

  return behaviors;
}
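
/**
 * Return a human-readable version string containing the crawler version
 * from package.json and the bundled warcio.js version.
 */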
export async function getInfoString() {
  const packageFileJSON = JSON.parse(
    await fsp.readFile(new URL("../../package.json", import.meta.url), {
      encoding: "utf-8",
    }),
  );
  const warcioPackageJSON = JSON.parse(
    await fsp.readFile(
      new URL("../../node_modules/warcio/package.json", import.meta.url),
      { encoding: "utf-8" },
    ),
  );

  return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`;
}