From 2af94ffab553ff0f90124f824f2ac8bb077cf500 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Thu, 3 Jul 2025 10:49:37 -0400
Subject: [PATCH] Support downloading seed file from URL (#852)

Fixes #841

Crawler work toward supporting long URL lists in Browsertrix.

This PR moves seed handling from the arg parser's validation step to
the crawler's bootstrap step so that the seed file can be fetched
asynchronously from a URL.

---------

Co-authored-by: Ilya Kreymer
---
 src/crawler.ts              | 10 ++---
 src/replaycrawler.ts        |  2 -
 src/util/argParser.ts       | 65 +++--------
 src/util/file_reader.ts     | 64 ++++++++++++++++++++++-------
 src/util/seeds.ts           | 82 ++++++++++++++++++++++++++++++++++---
 src/util/worker.ts          |  2 +-
 tests/scopes.test.js        | 31 +++++++-------
 tests/url_file_list.test.js | 36 ++++++++++++++++
 8 files changed, 189 insertions(+), 103 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index e7276e95..448f7f9b 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -62,7 +62,7 @@ import {
 } from "puppeteer-core";
 import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
-import { ScopedSeed } from "./util/seeds.js";
+import { ScopedSeed, parseSeeds } from "./util/seeds.js";
 import {
   WARCWriter,
   createWARCInfo,
@@ -134,7 +134,7 @@ export class Crawler {
 
   maxPageTime: number;
 
-  seeds: ScopedSeed[];
+  seeds: ScopedSeed[] = [];
   numOriginalSeeds = 0;
 
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -255,9 +255,6 @@ export class Crawler {
     this.saveStateFiles = [];
     this.lastSaveTime = 0;
 
-    this.seeds = this.params.scopedSeeds as ScopedSeed[];
-    this.numOriginalSeeds = this.seeds.length;
-
     // sum of page load + behavior timeouts + 2 x pageop timeouts (for cloudflare, link extraction) + extra page delay
     // if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
     this.maxPageTime =
@@ -514,6 +511,9 @@
 
     this.proxyServer = await initProxy(this.params, RUN_DETACHED);
 
+    this.seeds = await parseSeeds(this.params);
+    this.numOriginalSeeds = this.seeds.length;
+
     logger.info("Seeds", this.seeds);
 
     logger.info("Link Selectors", this.params.selectLinks);
diff --git a/src/replaycrawler.ts b/src/replaycrawler.ts
index 75abfc4e..819bcf39 100644
--- a/src/replaycrawler.ts
+++ b/src/replaycrawler.ts
@@ -96,8 +96,6 @@ export class ReplayCrawler extends Crawler {
     // skip text from first two frames, as they are RWP boilerplate
     this.skipTextDocs = SKIP_FRAMES;
 
-    this.params.scopedSeeds = [];
-
     this.params.screenshot = ["view"];
     this.params.text = ["to-warc"];
 
diff --git a/src/util/argParser.ts b/src/util/argParser.ts
index 41b8bd58..21b2db88 100644
--- a/src/util/argParser.ts
+++ b/src/util/argParser.ts
@@ -20,7 +20,6 @@ import {
   BxFunctionBindings,
   DEFAULT_CRAWL_ID_TEMPLATE,
 } from "./constants.js";
-import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";
 import { screenshotTypes } from "./screenshots.js";
 import {
@@ -37,12 +36,14 @@ export type CrawlerArgs = ReturnType<typeof parseArgs> & {
   logExcludeContext: LogContext[];
   text: string[];
 
-  scopedSeeds: ScopedSeed[];
-
   customBehaviors: string[];
 
   selectLinks: ExtractSelector[];
 
+  include: string[];
+  exclude: string[];
+
+  sitemap: boolean;
+
   crawlId: string;
 
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -776,22 +777,6 @@ class ArgParser {
       }
     }
 
-    if (argv.seedFile) {
-      const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
-      const urlSeedFileList = urlSeedFile.split("\n");
-
if (typeof argv.seeds === "string") { - argv.seeds = [argv.seeds]; - } - - for (const seed of urlSeedFileList) { - if (seed) { - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (argv.seeds as any).push(seed); - } - } - } - let selectLinks: ExtractSelector[]; if (argv.selectLinks) { @@ -823,50 +808,10 @@ class ArgParser { //logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`); } - const scopedSeeds: ScopedSeed[] = []; - - if (!isQA) { - const scopeOpts = { - scopeType: argv.scopeType, - sitemap: argv.sitemap, - include: argv.include, - exclude: argv.exclude, - depth: argv.depth, - extraHops: argv.extraHops, - }; - - for (const seed of argv.seeds) { - const newSeed = typeof seed === "string" ? { url: seed } : seed; - - try { - scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed })); - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } catch (e: any) { - logger.error("Failed to create seed", { - error: e.toString(), - ...scopeOpts, - ...newSeed, - }); - if (argv.failOnFailedSeed) { - logger.fatal( - "Invalid seed specified, aborting crawl", - { url: newSeed.url }, - "general", - 1, - ); - } - } - } - - if (!scopedSeeds.length) { - logger.fatal("No valid seeds specified, aborting crawl"); - } - } else if (!argv.qaSource) { + if (isQA && !argv.qaSource) { logger.fatal("--qaSource required for QA mode"); } - argv.scopedSeeds = scopedSeeds; - // Resolve statsFilename if (argv.statsFilename) { argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename); diff --git a/src/util/file_reader.ts b/src/util/file_reader.ts index fa8ad0bc..f0908d12 100644 --- a/src/util/file_reader.ts +++ b/src/util/file_reader.ts @@ -24,6 +24,48 @@ export type FileSource = { export type FileSources = FileSource[]; +async function getTempFile( + filename: string, + dirPrefix: string, +): Promise { + const tmpDir = path.join( + os.tmpdir(), + `${dirPrefix}-${crypto.randomBytes(4).toString("hex")}`, + ); + await fsp.mkdir(tmpDir, { recursive: true }); + return path.join(tmpDir, filename); +} + +async function writeUrlContentsToFile( + url: string, + pathPrefix: string, + pathDefaultExt: string, +) { + const res = await fetch(url, { dispatcher: getProxyDispatcher() }); + const fileContents = await res.text(); + + const filename = + path.basename(new URL(url).pathname) || "index." 
+
+  const filepath = await getTempFile(filename, pathPrefix);
+  await fsp.writeFile(filepath, fileContents);
+  return filepath;
+}
+
+export async function collectOnlineSeedFile(url: string): Promise<string> {
+  try {
+    const filepath = await writeUrlContentsToFile(url, "seeds-", ".txt");
+    logger.info("Seed file downloaded", { url, path: filepath });
+    return filepath;
+  } catch (e) {
+    logger.fatal("Error downloading seed file from URL", {
+      url,
+      ...formatErr(e),
+    });
+    throw e;
+  }
+}
+
 export async function collectCustomBehaviors(
   sources: string[],
 ): Promise<FileSources> {
@@ -79,7 +121,7 @@ async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
   } catch (e) {
     logger.fatal(
       "Error downloading custom behaviors from Git repo",
-      { url: urlStripped, error: e },
+      { url: urlStripped, ...formatErr(e) },
       "behavior",
     );
   }
 }
 
 async function collectOnlineBehavior(url: string): Promise<FileSources> {
-  const filename = path.basename(new URL(url).pathname);
-  const tmpDir = path.join(
-    os.tmpdir(),
-    `behaviors-${crypto.randomBytes(4).toString("hex")}`,
-  );
-  await fsp.mkdir(tmpDir, { recursive: true });
-  const behaviorFilepath = path.join(tmpDir, filename);
-
   try {
-    const res = await fetch(url, { dispatcher: getProxyDispatcher() });
-    const fileContents = await res.text();
-    await fsp.writeFile(behaviorFilepath, fileContents);
+    const behaviorFilepath = await writeUrlContentsToFile(
+      url,
+      "behaviors-",
+      ".js",
+    );
     logger.info(
       "Custom behavior file downloaded",
       { url, path: behaviorFilepath },
@@ -108,7 +144,7 @@
   } catch (e) {
     logger.fatal(
       "Error downloading custom behavior from URL",
-      { url, error: e },
+      { url, ...formatErr(e) },
       "behavior",
     );
   }
@@ -190,7 +226,7 @@ async function collectLocalPathBehaviors(
   } catch (e) {
     logger.fatal(
       "Error fetching local custom behaviors",
-      { path: resolvedPath, error: e },
+      { path: resolvedPath, ...formatErr(e) },
       "behavior",
     );
   }
diff --git a/src/util/seeds.ts b/src/util/seeds.ts
index d0e24445..ade15208 100644
--- a/src/util/seeds.ts
+++ b/src/util/seeds.ts
@@ -1,5 +1,9 @@
-import { logger } from "./logger.js";
+import fs from "fs";
+
 import { MAX_DEPTH } from "./constants.js";
+import { collectOnlineSeedFile } from "./file_reader.js";
+import { logger } from "./logger.js";
+import { type CrawlerArgs } from "./argParser.js";
 
 type ScopeType =
   | "prefix"
@@ -39,14 +43,14 @@ export class ScopedSeed {
     auth = null,
   }: {
     url: string;
-    scopeType: ScopeType;
+    scopeType: ScopeType | undefined;
     include: string[];
     exclude: string[];
     allowHash?: boolean;
    depth?: number;
     sitemap?: string | boolean | null;
     extraHops?: number;
-    auth: string | null;
+    auth?: string | null;
   }) {
     const parsedUrl = this.parseUrl(url);
     if (!parsedUrl) {
@@ -62,14 +66,14 @@ export class ScopedSeed {
     this.url = parsedUrl.href;
     this.include = parseRx(include);
     this.exclude = parseRx(exclude);
-    this.scopeType = scopeType;
 
     this._includeStr = include;
     this._excludeStr = exclude;
 
-    if (!this.scopeType) {
-      this.scopeType = this.include.length ? "custom" : "prefix";
+    if (!scopeType) {
+      scopeType = this.include.length ? "custom" : "prefix";
"custom" : "prefix"; } + this.scopeType = scopeType; if (this.scopeType !== "custom") { const [includeNew, allowHashNew] = this.scopeFromType( @@ -300,6 +304,72 @@ export class ScopedSeed { } } +export async function parseSeeds(params: CrawlerArgs): Promise { + let seeds = params.seeds as string[]; + const scopedSeeds: ScopedSeed[] = []; + + if (params.seedFile) { + let seedFilePath = params.seedFile as string; + if ( + seedFilePath.startsWith("http://") || + seedFilePath.startsWith("https://") + ) { + seedFilePath = await collectOnlineSeedFile(seedFilePath); + } + + const urlSeedFile = fs.readFileSync(seedFilePath, "utf8"); + const urlSeedFileList = urlSeedFile.split("\n"); + + if (typeof seeds === "string") { + seeds = [seeds]; + } + + for (const seed of urlSeedFileList) { + if (seed) { + seeds.push(seed); + } + } + } + + const scopeOpts = { + scopeType: params.scopeType as ScopeType | undefined, + sitemap: params.sitemap, + include: params.include, + exclude: params.exclude, + depth: params.depth, + extraHops: params.extraHops, + }; + + for (const seed of seeds) { + const newSeed = typeof seed === "string" ? { url: seed } : seed; + + try { + scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed })); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + } catch (e: any) { + logger.error("Failed to create seed", { + error: e.toString(), + ...scopeOpts, + ...newSeed, + }); + if (params.failOnFailedSeed) { + logger.fatal( + "Invalid seed specified, aborting crawl", + { url: newSeed.url }, + "general", + 1, + ); + } + } + } + + if (!params.qaSource && !scopedSeeds.length) { + logger.fatal("No valid seeds specified, aborting crawl"); + } + + return scopedSeeds; +} + export function rxEscape(string: string) { return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&"); } diff --git a/src/util/worker.ts b/src/util/worker.ts index f4dc8ddf..dee9ceba 100644 --- a/src/util/worker.ts +++ b/src/util/worker.ts @@ -351,7 +351,7 @@ export class PageWorker { let loggedWaiting = false; while (await this.crawler.isCrawlRunning()) { - await crawlState.processMessage(this.crawler.params.scopedSeeds); + await crawlState.processMessage(this.crawler.seeds); const data = await crawlState.nextFromQueue(); diff --git a/tests/scopes.test.js b/tests/scopes.test.js index ddb64e04..9717fb11 100644 --- a/tests/scopes.test.js +++ b/tests/scopes.test.js @@ -1,8 +1,9 @@ import { parseArgs } from "../dist/util/argParser.js"; +import { parseSeeds } from "../dist/util/seeds.js"; import fs from "fs"; -function getSeeds(config) { +async function getSeeds(config) { const orig = fs.readFileSync; fs.readFileSync = (name, ...args) => { @@ -12,12 +13,12 @@ function getSeeds(config) { return orig(name, ...args); }; - const res = parseArgs(["node", "crawler", "--config", "stdinconfig"]); - return res.scopedSeeds; + const params = parseArgs(["node", "crawler", "--config", "stdinconfig"]); + return await parseSeeds(params); } test("default scope", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - https://example.com/ @@ -30,7 +31,7 @@ seeds: }); test("default scope + exclude", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - https://example.com/ @@ -45,7 +46,7 @@ exclude: https://example.com/pathexclude }); test("default scope + exclude is numeric", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - https://example.com/ @@ -60,7 +61,7 @@ exclude: "2022" }); test("prefix scope global + exclude", async () => { - const seeds = 
+  const seeds = await getSeeds(`
 seeds:
   - https://example.com/
 
@@ -76,7 +77,7 @@ exclude: https://example.com/pathexclude
 });
 
 test("prefix scope per seed + exclude", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/
     scopeType: prefix
@@ -92,7 +93,7 @@ exclude: https://example.com/pathexclude
 });
 
 test("host scope and domain scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/
 
@@ -127,7 +128,7 @@ seeds:
 });
 
 test("domain scope drop www.", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://www.example.com/
     scopeType: domain
@@ -139,7 +140,7 @@ seeds:
 });
 
 test("custom scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/
     include: https?://example.com/(path|other)
@@ -153,7 +154,7 @@ seeds:
 });
 
 test("inherit scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/1
 
@@ -177,7 +178,7 @@ exclude: https://example.com/pathexclude
 });
 
 test("override scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/1
 
@@ -220,7 +221,7 @@ include: https://example.com/onlythispath
 });
 
 test("override scope with exclude", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/1
 
@@ -275,7 +276,7 @@ exclude:
 });
 
 test("with exclude non-string types", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/
     exclude: "2023"
diff --git a/tests/url_file_list.test.js b/tests/url_file_list.test.js
index 9901ff36..c76afa6e 100644
--- a/tests/url_file_list.test.js
+++ b/tests/url_file_list.test.js
@@ -38,3 +38,39 @@ test("check that URLs in seed-list are crawled", async () => {
   }
   expect(foundSeedUrl).toBe(true);
 });
+
+
+test("check that URLs in seed-list hosted at URL are crawled", async () => {
+  try {
+    await exec(
+      'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000',
+    );
+  } catch (error) {
+    console.log(error);
+  }
+
+  let crawled_pages = fs.readFileSync(
+    "test-crawls/collections/onlinefilelisttest/pages/pages.jsonl",
+    "utf8",
+  );
+  let seed_file = fs
+    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
+    .split("\n")
+    .sort();
+
+  let seed_file_list = [];
+  for (var j = 0; j < seed_file.length; j++) {
+    if (seed_file[j] != undefined) {
+      seed_file_list.push(seed_file[j]);
+    }
+  }
+
+  let foundSeedUrl = true;
+
+  for (var i = 1; i < seed_file_list.length; i++) {
+    if (crawled_pages.indexOf(seed_file_list[i]) == -1) {
+      foundSeedUrl = false;
+    }
+  }
+  expect(foundSeedUrl).toBe(true);
+});
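
A minimal usage sketch of the relocated, now-async seed parsing (not part of the patch; it mirrors the updated scopes.test.js above, and the --url alias for seeds is assumed from the crawler CLI):

import { parseArgs } from "../dist/util/argParser.js";
import { parseSeeds } from "../dist/util/seeds.js";

// parseArgs() now only collects raw options; parseSeeds() resolves them
// into ScopedSeed objects at crawler bootstrap, first downloading a
// seed file given as an http(s) URL to a temp file before reading it.
const params = parseArgs(["node", "crawler", "--url", "https://example.com/"]);
const seeds = await parseSeeds(params);
console.log(`resolved ${seeds.length} seeds`);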