Support downloading seed file from URL (#852)

Fixes #841 

Crawler-side work toward supporting long URL lists in Browsertrix. This PR moves seed handling from the arg parser's validation step to the crawler's bootstrap step so that the seed file can be fetched asynchronously from a URL.
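
Roughly, the new flow looks like the sketch below. This is a minimal illustration under the assumption of a standalone entry point, not the committed wiring: arg parsing stays synchronous, and the new async `parseSeeds()` helper is awaited afterwards, downloading the seed file first when `--seedFile` points at an http(s) URL.

```ts
import { parseArgs, type CrawlerArgs } from "./util/argParser.js";
import { parseSeeds } from "./util/seeds.js";

// Hypothetical entry point; in the actual change, the Crawler awaits
// parseSeeds() during its bootstrap step (see the Crawler hunks below).
async function bootstrap(argv: string[]): Promise<void> {
  // Arg parsing no longer constructs ScopedSeed objects.
  const params = parseArgs(argv) as CrawlerArgs;

  // parseSeeds() resolves --seedFile, downloading it first via
  // collectOnlineSeedFile() when the value is an http(s) URL, then
  // builds and validates the ScopedSeed list.
  const seeds = await parseSeeds(params);
  console.log(`parsed ${seeds.length} seeds`);
}

void bootstrap(process.argv);
```

The new integration test at the end of this diff exercises the same path end to end by passing `--urlFile` with a raw GitHub URL to a seed list fixture.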

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh 2025-07-03 10:49:37 -04:00 committed by GitHub
parent 687f08b1d0
commit 2af94ffab5
8 changed files with 189 additions and 103 deletions


@@ -62,7 +62,7 @@ import {
} from "puppeteer-core";
import { Recorder } from "./util/recorder.js";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
import { ScopedSeed, parseSeeds } from "./util/seeds.js";
import {
WARCWriter,
createWARCInfo,
@@ -134,7 +134,7 @@ export class Crawler {
maxPageTime: number;
seeds: ScopedSeed[];
seeds: ScopedSeed[] = [];
numOriginalSeeds = 0;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -255,9 +255,6 @@ export class Crawler {
this.saveStateFiles = [];
this.lastSaveTime = 0;
this.seeds = this.params.scopedSeeds as ScopedSeed[];
this.numOriginalSeeds = this.seeds.length;
// sum of page load + behavior timeouts + 2 x pageop timeouts (for cloudflare, link extraction) + extra page delay
// if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
this.maxPageTime =
@@ -514,6 +511,9 @@
this.proxyServer = await initProxy(this.params, RUN_DETACHED);
this.seeds = await parseSeeds(this.params);
this.numOriginalSeeds = this.seeds.length;
logger.info("Seeds", this.seeds);
logger.info("Link Selectors", this.params.selectLinks);


@@ -96,8 +96,6 @@ export class ReplayCrawler extends Crawler {
// skip text from first two frames, as they are RWP boilerplate
this.skipTextDocs = SKIP_FRAMES;
this.params.scopedSeeds = [];
this.params.screenshot = ["view"];
this.params.text = ["to-warc"];


@@ -20,7 +20,6 @@ import {
BxFunctionBindings,
DEFAULT_CRAWL_ID_TEMPLATE,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js";
import {
@@ -37,12 +36,14 @@ export type CrawlerArgs = ReturnType<typeof parseArgs> & {
logExcludeContext: LogContext[];
text: string[];
scopedSeeds: ScopedSeed[];
customBehaviors: string[];
selectLinks: ExtractSelector[];
include: string[];
exclude: string[];
sitemap: boolean;
crawlId: string;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -776,22 +777,6 @@ class ArgParser {
}
}
if (argv.seedFile) {
const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
const urlSeedFileList = urlSeedFile.split("\n");
if (typeof argv.seeds === "string") {
argv.seeds = [argv.seeds];
}
for (const seed of urlSeedFileList) {
if (seed) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(argv.seeds as any).push(seed);
}
}
}
let selectLinks: ExtractSelector[];
if (argv.selectLinks) {
@@ -823,50 +808,10 @@ class ArgParser {
//logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
}
const scopedSeeds: ScopedSeed[] = [];
if (!isQA) {
const scopeOpts = {
scopeType: argv.scopeType,
sitemap: argv.sitemap,
include: argv.include,
exclude: argv.exclude,
depth: argv.depth,
extraHops: argv.extraHops,
};
for (const seed of argv.seeds) {
const newSeed = typeof seed === "string" ? { url: seed } : seed;
try {
scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
logger.error("Failed to create seed", {
error: e.toString(),
...scopeOpts,
...newSeed,
});
if (argv.failOnFailedSeed) {
logger.fatal(
"Invalid seed specified, aborting crawl",
{ url: newSeed.url },
"general",
1,
);
}
}
}
if (!scopedSeeds.length) {
logger.fatal("No valid seeds specified, aborting crawl");
}
} else if (!argv.qaSource) {
if (isQA && !argv.qaSource) {
logger.fatal("--qaSource required for QA mode");
}
argv.scopedSeeds = scopedSeeds;
// Resolve statsFilename
if (argv.statsFilename) {
argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);


@@ -24,6 +24,48 @@ export type FileSource = {
export type FileSources = FileSource[];
async function getTempFile(
filename: string,
dirPrefix: string,
): Promise<string> {
const tmpDir = path.join(
os.tmpdir(),
`${dirPrefix}-${crypto.randomBytes(4).toString("hex")}`,
);
await fsp.mkdir(tmpDir, { recursive: true });
return path.join(tmpDir, filename);
}
async function writeUrlContentsToFile(
url: string,
pathPrefix: string,
pathDefaultExt: string,
) {
const res = await fetch(url, { dispatcher: getProxyDispatcher() });
const fileContents = await res.text();
const filename =
path.basename(new URL(url).pathname) || "index." + pathDefaultExt;
const filepath = await getTempFile(filename, pathPrefix);
await fsp.writeFile(filepath, fileContents);
return filepath;
}
export async function collectOnlineSeedFile(url: string): Promise<string> {
try {
const filepath = await writeUrlContentsToFile(url, "seeds-", ".txt");
logger.info("Seed file downloaded", { url, path: filepath });
return filepath;
} catch (e) {
logger.fatal("Error downloading seed file from URL", {
url,
...formatErr(e),
});
throw e;
}
}
export async function collectCustomBehaviors(
sources: string[],
): Promise<FileSources> {
@@ -79,7 +121,7 @@ async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
} catch (e) {
logger.fatal(
"Error downloading custom behaviors from Git repo",
{ url: urlStripped, error: e },
{ url: urlStripped, ...formatErr(e) },
"behavior",
);
}
@@ -87,18 +129,12 @@ async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
}
async function collectOnlineBehavior(url: string): Promise<FileSources> {
const filename = path.basename(new URL(url).pathname);
const tmpDir = path.join(
os.tmpdir(),
`behaviors-${crypto.randomBytes(4).toString("hex")}`,
);
await fsp.mkdir(tmpDir, { recursive: true });
const behaviorFilepath = path.join(tmpDir, filename);
try {
const res = await fetch(url, { dispatcher: getProxyDispatcher() });
const fileContents = await res.text();
await fsp.writeFile(behaviorFilepath, fileContents);
const behaviorFilepath = await writeUrlContentsToFile(
url,
"behaviors-",
".js",
);
logger.info(
"Custom behavior file downloaded",
{ url, path: behaviorFilepath },
@@ -108,7 +144,7 @@ async function collectOnlineBehavior(url: string): Promise<FileSources> {
} catch (e) {
logger.fatal(
"Error downloading custom behavior from URL",
{ url, error: e },
{ url, ...formatErr(e) },
"behavior",
);
}
@@ -190,7 +226,7 @@ async function collectLocalPathBehaviors(
} catch (e) {
logger.fatal(
"Error fetching local custom behaviors",
{ path: resolvedPath, error: e },
{ path: resolvedPath, ...formatErr(e) },
"behavior",
);
}


@@ -1,5 +1,9 @@
import { logger } from "./logger.js";
import fs from "fs";
import { MAX_DEPTH } from "./constants.js";
import { collectOnlineSeedFile } from "./file_reader.js";
import { logger } from "./logger.js";
import { type CrawlerArgs } from "./argParser.js";
type ScopeType =
| "prefix"
@@ -39,14 +43,14 @@ export class ScopedSeed {
auth = null,
}: {
url: string;
scopeType: ScopeType;
scopeType: ScopeType | undefined;
include: string[];
exclude: string[];
allowHash?: boolean;
depth?: number;
sitemap?: string | boolean | null;
extraHops?: number;
auth: string | null;
auth?: string | null;
}) {
const parsedUrl = this.parseUrl(url);
if (!parsedUrl) {
@@ -62,14 +66,14 @@
this.url = parsedUrl.href;
this.include = parseRx(include);
this.exclude = parseRx(exclude);
this.scopeType = scopeType;
this._includeStr = include;
this._excludeStr = exclude;
if (!this.scopeType) {
this.scopeType = this.include.length ? "custom" : "prefix";
if (!scopeType) {
scopeType = this.include.length ? "custom" : "prefix";
}
this.scopeType = scopeType;
if (this.scopeType !== "custom") {
const [includeNew, allowHashNew] = this.scopeFromType(
@@ -300,6 +304,72 @@ export class ScopedSeed {
}
}
export async function parseSeeds(params: CrawlerArgs): Promise<ScopedSeed[]> {
let seeds = params.seeds as string[];
const scopedSeeds: ScopedSeed[] = [];
if (params.seedFile) {
let seedFilePath = params.seedFile as string;
if (
seedFilePath.startsWith("http://") ||
seedFilePath.startsWith("https://")
) {
seedFilePath = await collectOnlineSeedFile(seedFilePath);
}
const urlSeedFile = fs.readFileSync(seedFilePath, "utf8");
const urlSeedFileList = urlSeedFile.split("\n");
if (typeof seeds === "string") {
seeds = [seeds];
}
for (const seed of urlSeedFileList) {
if (seed) {
seeds.push(seed);
}
}
}
const scopeOpts = {
scopeType: params.scopeType as ScopeType | undefined,
sitemap: params.sitemap,
include: params.include,
exclude: params.exclude,
depth: params.depth,
extraHops: params.extraHops,
};
for (const seed of seeds) {
const newSeed = typeof seed === "string" ? { url: seed } : seed;
try {
scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
logger.error("Failed to create seed", {
error: e.toString(),
...scopeOpts,
...newSeed,
});
if (params.failOnFailedSeed) {
logger.fatal(
"Invalid seed specified, aborting crawl",
{ url: newSeed.url },
"general",
1,
);
}
}
}
if (!params.qaSource && !scopedSeeds.length) {
logger.fatal("No valid seeds specified, aborting crawl");
}
return scopedSeeds;
}
export function rxEscape(string: string) {
return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
}


@@ -351,7 +351,7 @@ export class PageWorker {
let loggedWaiting = false;
while (await this.crawler.isCrawlRunning()) {
await crawlState.processMessage(this.crawler.params.scopedSeeds);
await crawlState.processMessage(this.crawler.seeds);
const data = await crawlState.nextFromQueue();


@@ -1,8 +1,9 @@
import { parseArgs } from "../dist/util/argParser.js";
import { parseSeeds } from "../dist/util/seeds.js";
import fs from "fs";
function getSeeds(config) {
async function getSeeds(config) {
const orig = fs.readFileSync;
fs.readFileSync = (name, ...args) => {
@@ -12,12 +13,12 @@ function getSeeds(config) {
return orig(name, ...args);
};
const res = parseArgs(["node", "crawler", "--config", "stdinconfig"]);
return res.scopedSeeds;
const params = parseArgs(["node", "crawler", "--config", "stdinconfig"]);
return await parseSeeds(params);
}
test("default scope", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- https://example.com/
@@ -30,7 +31,7 @@ seeds:
});
test("default scope + exclude", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- https://example.com/
@@ -45,7 +46,7 @@ exclude: https://example.com/pathexclude
});
test("default scope + exclude is numeric", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- https://example.com/
@@ -60,7 +61,7 @@ exclude: "2022"
});
test("prefix scope global + exclude", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- https://example.com/
@@ -76,7 +77,7 @@ exclude: https://example.com/pathexclude
});
test("prefix scope per seed + exclude", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- url: https://example.com/
scopeType: prefix
@@ -92,7 +93,7 @@ exclude: https://example.com/pathexclude
});
test("host scope and domain scope", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- url: https://example.com/
@@ -127,7 +128,7 @@ seeds:
});
test("domain scope drop www.", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- url: https://www.example.com/
scopeType: domain
@@ -139,7 +140,7 @@ seeds:
});
test("custom scope", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- url: https://example.com/
include: https?://example.com/(path|other)
@@ -153,7 +154,7 @@ seeds:
});
test("inherit scope", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- url: https://example.com/1
@@ -177,7 +178,7 @@ exclude: https://example.com/pathexclude
});
test("override scope", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- url: https://example.com/1
@@ -220,7 +221,7 @@ include: https://example.com/onlythispath
});
test("override scope with exclude", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- url: https://example.com/1
@@ -275,7 +276,7 @@ exclude:
});
test("with exclude non-string types", async () => {
const seeds = getSeeds(`
const seeds = await getSeeds(`
seeds:
- url: https://example.com/
exclude: "2023"


@@ -38,3 +38,39 @@ test("check that URLs in seed-list are crawled", async () => {
}
expect(foundSeedUrl).toBe(true);
});
test("check that URLs in seed-list hosted at URL are crawled", async () => {
try {
await exec(
'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000',
);
} catch (error) {
console.log(error);
}
let crawled_pages = fs.readFileSync(
"test-crawls/collections/onlinefilelisttest/pages/pages.jsonl",
"utf8",
);
let seed_file = fs
.readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
.split("\n")
.sort();
let seed_file_list = [];
for (var j = 0; j < seed_file.length; j++) {
if (seed_file[j] != undefined) {
seed_file_list.push(seed_file[j]);
}
}
let foundSeedUrl = true;
for (var i = 1; i < seed_file_list.length; i++) {
if (crawled_pages.indexOf(seed_file_list[i]) == -1) {
foundSeedUrl = false;
}
}
expect(foundSeedUrl).toBe(true);
});