2024-03-18 14:24:48 -07:00
|
|
|
import fsp from "fs/promises";
import path from "path";
import os from "os";
import crypto from "crypto";
import util from "util";
import {
  exec as execCallback,
  execFile as execFileCallback,
} from "child_process";

import { fetch } from "undici";

import { formatErr, logger } from "./logger.js";
import { getProxyDispatcher } from "./proxy.js";
import { parseRecorderFlowJson } from "./flowbehavior.js";
|
2023-07-06 16:09:48 -04:00
|
|
|
|
2024-11-14 01:50:33 -05:00
|
|
|
// Promise-returning version of child_process.exec, created via util.promisify.
const exec = util.promisify(execCallback);
|
|
|
|
|
|
2025-04-01 21:15:57 -04:00
|
|
|
// Depth limit for the recursive directory traversal in
// collectLocalPathBehaviors: a path reached at this depth (or deeper) is
// skipped with a warning.
const MAX_DEPTH = 5;
|
2023-07-06 16:09:48 -04:00
|
|
|
|
2024-11-04 23:30:53 -05:00
|
|
|
// Add .ts to allowed extensions when we can support it
|
2025-04-09 12:24:29 +02:00
|
|
|
// File extensions (as returned by path.extname) that are accepted as custom
// behavior files.
const ALLOWED_EXTS = [".js", ".json"];
|
2024-11-04 23:30:53 -05:00
|
|
|
|
|
|
|
|
/** A single collected file: where it was read from and its text. */
export type FileSource = {
  /** Path of the file the contents were read from. */
  path: string;
  /** Text of the file, as prepared by the collector that produced this entry. */
  contents: string;
};
|
|
|
|
|
|
|
|
|
|
/** A list of collected files. */
export type FileSources = FileSource[];
|
|
|
|
|
|
2025-07-03 10:49:37 -04:00
|
|
|
/**
 * Create a new, uniquely named directory under the OS temp directory and
 * return the path that a file called `filename` would have inside it.
 *
 * Only the directory is created; the file itself is not.
 *
 * @param filename - Name of the file to place inside the new directory.
 * @param dirPrefix - Prefix of the directory name; `-` and a random suffix
 *   are appended to it.
 * @returns The path `<tmpdir>/<dirPrefix>-<random>/<filename>`.
 */
async function getTempFile(
  filename: string,
  dirPrefix: string,
): Promise<string> {
  // mkdtemp always creates a brand-new directory, whereas
  // mkdir({ recursive: true }) with a hand-rolled random suffix would
  // silently reuse an existing directory on a name collision.
  const tmpDir = await fsp.mkdtemp(path.join(os.tmpdir(), `${dirPrefix}-`));
  return path.join(tmpDir, filename);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Download `url` and save the response body to a file in a new temp directory.
 *
 * The request is made with the dispatcher returned by `getProxyDispatcher()`.
 * The file is named after the last segment of the URL path; when the path has
 * no final segment (e.g. it ends in `/`), `index.<pathDefaultExt>` is used.
 *
 * @param url - URL to download.
 * @param pathPrefix - Prefix for the temp directory name (see `getTempFile`).
 * @param pathDefaultExt - Extension for the fallback filename, with or
 *   without a leading dot.
 * @returns Path of the file that was written.
 * @throws If the request fails, the response status is not 2xx, or the file
 *   cannot be written.
 */
async function writeUrlContentsToFile(
  url: string,
  pathPrefix: string,
  pathDefaultExt: string,
) {
  const res = await fetch(url, { dispatcher: getProxyDispatcher() });

  // Without this check an HTTP error page (404, 500, ...) would be saved and
  // later treated as a valid seed list or behavior script.
  if (!res.ok) {
    // Release the connection before bailing out.
    await res.body?.cancel();
    throw new Error(
      `Request for ${url} failed with status ${res.status} ${res.statusText}`,
    );
  }

  const fileContents = await res.text();

  // Callers pass extensions such as ".txt"; strip leading dots so the
  // fallback name is "index.txt" rather than "index..txt".
  const defaultExt = pathDefaultExt.replace(/^\.+/, "");
  const defaultName = defaultExt ? `index.${defaultExt}` : "index";

  const filename = path.basename(new URL(url).pathname) || defaultName;
  const filepath = await getTempFile(filename, pathPrefix);

  await fsp.writeFile(filepath, fileContents);
  return filepath;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Download a seed file from `url` into a temp directory.
 *
 * @param url - Location of the seed file.
 * @returns Path of the downloaded file.
 * @throws Re-throws any download or write error after reporting it through
 *   `logger.fatal`.
 */
export async function collectOnlineSeedFile(url: string): Promise<string> {
  try {
    const seedPath = await writeUrlContentsToFile(url, "seeds-", ".txt");
    const details = { url, path: seedPath };
    logger.info("Seed file downloaded", details);
    return seedPath;
  } catch (err) {
    const details = { url, ...formatErr(err) };
    logger.fatal("Error downloading seed file from URL", details);
    throw err;
  }
}
|
|
|
|
|
|
2024-11-04 23:30:53 -05:00
|
|
|
// Matches only real http(s) URLs, so that local paths which merely start with
// the letters "http" (e.g. "httpd-behaviors/") are still read from disk.
const HTTP_URL_RE = /^https?:\/\//i;

/**
 * Collect custom behavior files from a list of sources.
 *
 * Each source is handled according to its form:
 * - `git+<url>`: cloned as a git repository,
 * - `http://` or `https://` URL: downloaded as a single file,
 * - anything else: treated as a local file or directory path.
 *
 * Sources are processed one after another, and results are returned in the
 * order of `sources`.
 *
 * @param sources - Behavior source strings.
 * @returns All collected behavior files.
 */
export async function collectCustomBehaviors(
  sources: string[],
): Promise<FileSources> {
  const collectedSources: FileSources = [];

  for (const fileSource of sources) {
    let newSources: FileSources;

    if (fileSource.startsWith("git+")) {
      newSources = await collectGitBehaviors(fileSource);
    } else if (HTTP_URL_RE.test(fileSource)) {
      newSources = await collectOnlineBehavior(fileSource);
    } else {
      newSources = await collectLocalPathBehaviors(fileSource);
    }

    collectedSources.push(...newSources);
  }

  return collectedSources;
}
|
|
|
|
|
|
2024-11-14 01:50:33 -05:00
|
|
|
// Promise-returning child_process.execFile; runs a program with an explicit
// argument array and no intermediate shell.
const execFile = util.promisify(execFileCallback);

/**
 * Clone a git repository into a temp directory and collect the behavior
 * files it contains.
 *
 * `gitUrl` has the form `git+<repo-url>[?branch=<name>&path=<subdir>]`:
 * - `branch` selects the branch to clone (single-branch clone),
 * - `path` restricts collection to that path inside the clone.
 *
 * @param gitUrl - Source string starting with `git+`.
 * @returns The collected behavior files. On any failure (malformed URL,
 *   clone error) the error is reported through `logger.fatal`, and `[]` is
 *   returned if that call returns.
 */
async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
  // Remove only the leading marker; splitting on "git+" would truncate URLs
  // that happen to contain "git+" later in the string.
  const url = gitUrl.startsWith("git+") ? gitUrl.slice("git+".length) : gitUrl;
  const [urlStripped = url] = url.split("?");

  try {
    // Parsed inside the try so a malformed URL is reported like any other
    // failure of this function.
    const params = new URL(url).searchParams;
    const branch = params.get("branch") || "";
    const relPath = params.get("path") || "";

    const tmpDir = path.join(
      os.tmpdir(),
      `behaviors-repo-${crypto.randomBytes(4).toString("hex")}`,
    );

    // Arguments are passed as an array and never go through a shell, so
    // spaces or shell metacharacters in the branch name or URL cannot break
    // the command or inject another one.
    const cloneArgs = ["clone"];
    if (branch) {
      cloneArgs.push("-b", branch, "--single-branch");
    }
    // "--" ends option parsing so the URL and target directory are always
    // treated as positional arguments.
    cloneArgs.push("--", urlStripped, tmpDir);

    const pathToCollect = relPath ? path.join(tmpDir, relPath) : tmpDir;

    await execFile("git", cloneArgs);
    logger.info(
      "Custom behavior files downloaded from git repo",
      { url: urlStripped },
      "behavior",
    );
    return await collectLocalPathBehaviors(pathToCollect);
  } catch (e) {
    logger.fatal(
      "Error downloading custom behaviors from Git repo",
      { url: urlStripped, ...formatErr(e) },
      "behavior",
    );
  }
  return [];
}
|
|
|
|
|
|
2024-11-04 23:30:53 -05:00
|
|
|
/**
 * Download a single behavior file from `url` and collect it.
 *
 * The file is saved to a temp directory and then read back through
 * `collectLocalPathBehaviors`, with `url` passed as its source label.
 *
 * @param url - Location of the behavior file.
 * @returns The collected behavior file(s). Failures are reported through
 *   `logger.fatal`, and `[]` is returned if that call returns.
 */
async function collectOnlineBehavior(url: string): Promise<FileSources> {
  let collected: FileSources = [];

  try {
    const localCopy = await writeUrlContentsToFile(url, "behaviors-", ".js");
    logger.info(
      "Custom behavior file downloaded",
      { url, path: localCopy },
      "behavior",
    );
    collected = await collectLocalPathBehaviors(localCopy, 0, url);
  } catch (err) {
    const details = { url, ...formatErr(err) };
    logger.fatal(
      "Error downloading custom behavior from URL",
      details,
      "behavior",
    );
  }

  return collected;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read one behavior file and wrap it as a `FileSources` list.
 *
 * `.json` files are passed through `parseRecorderFlowJson` first; if that
 * throws, the failure is reported and the file is skipped.
 *
 * @param resolvedPath - Path of the file to read.
 * @param source - Label used in log output and passed to the JSON parser.
 * @returns A one-element list, or `[]` when a JSON file could not be parsed.
 */
async function loadBehaviorFile(
  resolvedPath: string,
  source: string,
): Promise<FileSources> {
  let contents = await fsp.readFile(resolvedPath, { encoding: "utf-8" });

  if (path.extname(resolvedPath) === ".json") {
    try {
      contents = parseRecorderFlowJson(contents, source);
    } catch (e) {
      logger.fatal(
        "Unable to parse recorder flow JSON, ignored",
        formatErr(e),
        "behavior",
      );
      // The message says the file is ignored, so do not hand the raw JSON
      // back as if it were a behavior script.
      return [];
    }
  }

  // Logged only once the file has actually been read (and parsed).
  logger.info("Custom behavior script added", { source }, "behavior");

  return [
    {
      path: resolvedPath,
      contents: `/* src: ${resolvedPath} */\n\n${contents}`,
    },
  ];
}

/**
 * Collect behavior files from a local file or directory.
 *
 * - A file whose extension is in `ALLOWED_EXTS` is returned as a single entry.
 * - A directory is traversed recursively, except directories named `.git`,
 *   up to `MAX_DEPTH` levels.
 * - Any other file is skipped (with a warning when it is the top-level path).
 *
 * @param fileOrDir - File or directory path; resolved to an absolute path.
 * @param depth - Current recursion depth; callers outside this function use
 *   the default of 0.
 * @param source - Label for log output; defaults to the file's basename.
 * @returns The collected behavior files. Filesystem errors are reported
 *   through `logger.fatal`; at depth 0, finding nothing is reported the same
 *   way.
 */
async function collectLocalPathBehaviors(
  fileOrDir: string,
  depth = 0,
  source?: string,
): Promise<FileSources> {
  const resolvedPath = path.resolve(fileOrDir);
  const filename = path.basename(resolvedPath);

  if (depth >= MAX_DEPTH) {
    logger.warn(
      `Max depth of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
      {},
      "behavior",
    );
    return [];
  }

  const behaviors: FileSources = [];

  try {
    const stat = await fsp.stat(resolvedPath);

    if (stat.isFile() && ALLOWED_EXTS.includes(path.extname(resolvedPath))) {
      return await loadBehaviorFile(resolvedPath, source ?? filename);
    }

    const isDir = stat.isDirectory();

    // ignore .git directory of git repositories
    if (isDir && filename === ".git") {
      return [];
    }

    if (!isDir && depth === 0) {
      logger.warn(
        "The provided path is not a .js file or directory",
        { path: resolvedPath },
        "behavior",
      );
    }

    if (isDir) {
      const files = await fsp.readdir(resolvedPath);
      for (const file of files) {
        const newBehaviors = await collectLocalPathBehaviors(
          path.join(resolvedPath, file),
          depth + 1,
        );
        behaviors.push(...newBehaviors);
      }
    }
  } catch (e) {
    logger.fatal(
      "Error fetching local custom behaviors",
      { path: resolvedPath, ...formatErr(e) },
      "behavior",
    );
    // The failure has been reported; skip the "nothing found" report below
    // so the same problem is not logged twice.
    return behaviors;
  }

  // An array is always truthy, so the emptiness check must use its length.
  if (behaviors.length === 0 && depth === 0) {
    logger.fatal(
      "No custom behaviors found at specified path",
      { path: resolvedPath },
      "behavior",
    );
  }

  return behaviors;
}
|
2024-03-18 14:24:48 -07:00
|
|
|
|
|
|
|
|
/**
 * Build the version banner for the crawler.
 *
 * Reads the `version` field from this package's `package.json` and from the
 * `package.json` of the installed `warcio` module, both located relative to
 * this file.
 *
 * @returns A string of the form
 *   `Browsertrix-Crawler <version> (with warcio.js <version>)`.
 */
export async function getInfoString() {
  // Load and parse a JSON file addressed relative to this module.
  const readJson = async (relativeUrl: string) =>
    JSON.parse(
      await fsp.readFile(new URL(relativeUrl, import.meta.url), {
        encoding: "utf-8",
      }),
    );

  const crawlerPkg = await readJson("../../package.json");
  const warcioPkg = await readJson("../../node_modules/warcio/package.json");

  return `Browsertrix-Crawler ${crawlerPkg.version} (with warcio.js ${warcioPkg.version})`;
}
|