mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Support downloading seed file from URL (#852)
Fixes #841. Crawler work toward supporting long URL lists in Browsertrix. This PR moves seed handling from the arg parser's validation step to the crawler's bootstrap step so that the seed file can be fetched asynchronously from a URL (see the sketch below). --------- Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com> Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
687f08b1d0
commit
2af94ffab5
8 changed files with 189 additions and 103 deletions
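In outline, the change moves seed expansion out of the synchronous argument-parsing step and into the crawler's async bootstrap, where the seed file can be awaited if it is given as an http(s) URL. A minimal sketch of the new flow, assuming the import paths used in the diff; CrawlerSketch and its bootstrap method are illustrative stand-ins, while parseSeeds, seeds, and numOriginalSeeds come from the changes below:

import { ScopedSeed, parseSeeds } from "./util/seeds.js";
import { type CrawlerArgs } from "./util/argParser.js";

// Simplified excerpt of the new flow; the real Crawler.bootstrap() does much more.
class CrawlerSketch {
  seeds: ScopedSeed[] = [];
  numOriginalSeeds = 0;

  async bootstrap(params: CrawlerArgs) {
    // the proxy is initialized before this point, so a seed file given as a
    // URL can be downloaded through the configured proxy dispatcher
    this.seeds = await parseSeeds(params); // may fetch --seedFile/--urlFile from a URL
    this.numOriginalSeeds = this.seeds.length;
  }
}

The ordering matters: parseSeeds runs after initProxy so that the download can go through the proxy dispatcher.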
@@ -62,7 +62,7 @@ import {
 } from "puppeteer-core";
 import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
-import { ScopedSeed } from "./util/seeds.js";
+import { ScopedSeed, parseSeeds } from "./util/seeds.js";
 import {
   WARCWriter,
   createWARCInfo,

@@ -134,7 +134,7 @@ export class Crawler {

   maxPageTime: number;

-  seeds: ScopedSeed[];
+  seeds: ScopedSeed[] = [];
   numOriginalSeeds = 0;

   // eslint-disable-next-line @typescript-eslint/no-explicit-any

@@ -255,9 +255,6 @@ export class Crawler {
     this.saveStateFiles = [];
     this.lastSaveTime = 0;

-    this.seeds = this.params.scopedSeeds as ScopedSeed[];
-    this.numOriginalSeeds = this.seeds.length;
-
     // sum of page load + behavior timeouts + 2 x pageop timeouts (for cloudflare, link extraction) + extra page delay
     // if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
     this.maxPageTime =

@@ -514,6 +511,9 @@ export class Crawler {

     this.proxyServer = await initProxy(this.params, RUN_DETACHED);

+    this.seeds = await parseSeeds(this.params);
+    this.numOriginalSeeds = this.seeds.length;
+
     logger.info("Seeds", this.seeds);

     logger.info("Link Selectors", this.params.selectLinks);
@@ -96,8 +96,6 @@ export class ReplayCrawler extends Crawler {
     // skip text from first two frames, as they are RWP boilerplate
     this.skipTextDocs = SKIP_FRAMES;

-    this.params.scopedSeeds = [];
-
     this.params.screenshot = ["view"];
     this.params.text = ["to-warc"];

@@ -20,7 +20,6 @@ import {
   BxFunctionBindings,
   DEFAULT_CRAWL_ID_TEMPLATE,
 } from "./constants.js";
-import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";
 import { screenshotTypes } from "./screenshots.js";
 import {

@@ -37,12 +36,14 @@ export type CrawlerArgs = ReturnType<typeof parseArgs> & {
   logExcludeContext: LogContext[];
   text: string[];

-  scopedSeeds: ScopedSeed[];
-
   customBehaviors: string[];

   selectLinks: ExtractSelector[];

+  include: string[];
+  exclude: string[];
+  sitemap: boolean;
+
   crawlId: string;

   // eslint-disable-next-line @typescript-eslint/no-explicit-any

@@ -776,22 +777,6 @@ class ArgParser {
       }
     }

-    if (argv.seedFile) {
-      const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
-      const urlSeedFileList = urlSeedFile.split("\n");
-
-      if (typeof argv.seeds === "string") {
-        argv.seeds = [argv.seeds];
-      }
-
-      for (const seed of urlSeedFileList) {
-        if (seed) {
-          // eslint-disable-next-line @typescript-eslint/no-explicit-any
-          (argv.seeds as any).push(seed);
-        }
-      }
-    }
-
     let selectLinks: ExtractSelector[];

     if (argv.selectLinks) {

@@ -823,50 +808,10 @@ class ArgParser {
       //logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
     }

-    const scopedSeeds: ScopedSeed[] = [];
-
-    if (!isQA) {
-      const scopeOpts = {
-        scopeType: argv.scopeType,
-        sitemap: argv.sitemap,
-        include: argv.include,
-        exclude: argv.exclude,
-        depth: argv.depth,
-        extraHops: argv.extraHops,
-      };
-
-      for (const seed of argv.seeds) {
-        const newSeed = typeof seed === "string" ? { url: seed } : seed;
-
-        try {
-          scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
-          // eslint-disable-next-line @typescript-eslint/no-explicit-any
-        } catch (e: any) {
-          logger.error("Failed to create seed", {
-            error: e.toString(),
-            ...scopeOpts,
-            ...newSeed,
-          });
-          if (argv.failOnFailedSeed) {
-            logger.fatal(
-              "Invalid seed specified, aborting crawl",
-              { url: newSeed.url },
-              "general",
-              1,
-            );
-          }
-        }
-      }
-
-      if (!scopedSeeds.length) {
-        logger.fatal("No valid seeds specified, aborting crawl");
-      }
-    } else if (!argv.qaSource) {
+    if (isQA && !argv.qaSource) {
       logger.fatal("--qaSource required for QA mode");
     }

-    argv.scopedSeeds = scopedSeeds;
-
     // Resolve statsFilename
     if (argv.statsFilename) {
       argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
@@ -24,6 +24,48 @@ export type FileSource = {

 export type FileSources = FileSource[];

+async function getTempFile(
+  filename: string,
+  dirPrefix: string,
+): Promise<string> {
+  const tmpDir = path.join(
+    os.tmpdir(),
+    `${dirPrefix}-${crypto.randomBytes(4).toString("hex")}`,
+  );
+  await fsp.mkdir(tmpDir, { recursive: true });
+  return path.join(tmpDir, filename);
+}
+
+async function writeUrlContentsToFile(
+  url: string,
+  pathPrefix: string,
+  pathDefaultExt: string,
+) {
+  const res = await fetch(url, { dispatcher: getProxyDispatcher() });
+  const fileContents = await res.text();
+
+  const filename =
+    path.basename(new URL(url).pathname) || "index." + pathDefaultExt;
+  const filepath = await getTempFile(filename, pathPrefix);
+
+  await fsp.writeFile(filepath, fileContents);
+  return filepath;
+}
+
+export async function collectOnlineSeedFile(url: string): Promise<string> {
+  try {
+    const filepath = await writeUrlContentsToFile(url, "seeds-", ".txt");
+    logger.info("Seed file downloaded", { url, path: filepath });
+    return filepath;
+  } catch (e) {
+    logger.fatal("Error downloading seed file from URL", {
+      url,
+      ...formatErr(e),
+    });
+    throw e;
+  }
+}
+
 export async function collectCustomBehaviors(
   sources: string[],
 ): Promise<FileSources> {
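The new collectOnlineSeedFile helper simply wraps writeUrlContentsToFile with a "seeds-" temp-directory prefix. A hedged usage sketch follows; loadRemoteSeeds and the example path are hypothetical, and the import path assumes a caller sitting at the crawler.ts level:

import fs from "fs";

import { collectOnlineSeedFile } from "./util/file_reader.js";

// Hypothetical helper: download a remote seed list to a temp file, then read
// it back line by line, skipping blanks (mirroring what parseSeeds does).
async function loadRemoteSeeds(url: string): Promise<string[]> {
  // Returns the path of the downloaded temp file,
  // e.g. /tmp/seeds-<random>/urlSeedFile.txt
  const seedFilePath = await collectOnlineSeedFile(url);
  return fs
    .readFileSync(seedFilePath, "utf8")
    .split("\n")
    .filter((line) => line.trim().length > 0);
}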
@@ -79,7 +121,7 @@ async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
   } catch (e) {
     logger.fatal(
       "Error downloading custom behaviors from Git repo",
-      { url: urlStripped, error: e },
+      { url: urlStripped, ...formatErr(e) },
       "behavior",
     );
   }

@@ -87,18 +129,12 @@ async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
 }

 async function collectOnlineBehavior(url: string): Promise<FileSources> {
-  const filename = path.basename(new URL(url).pathname);
-  const tmpDir = path.join(
-    os.tmpdir(),
-    `behaviors-${crypto.randomBytes(4).toString("hex")}`,
-  );
-  await fsp.mkdir(tmpDir, { recursive: true });
-  const behaviorFilepath = path.join(tmpDir, filename);
-
   try {
-    const res = await fetch(url, { dispatcher: getProxyDispatcher() });
-    const fileContents = await res.text();
-    await fsp.writeFile(behaviorFilepath, fileContents);
+    const behaviorFilepath = await writeUrlContentsToFile(
+      url,
+      "behaviors-",
+      ".js",
+    );
     logger.info(
       "Custom behavior file downloaded",
       { url, path: behaviorFilepath },

@@ -108,7 +144,7 @@ async function collectOnlineBehavior(url: string): Promise<FileSources> {
   } catch (e) {
     logger.fatal(
       "Error downloading custom behavior from URL",
-      { url, error: e },
+      { url, ...formatErr(e) },
       "behavior",
     );
   }

@@ -190,7 +226,7 @@ async function collectLocalPathBehaviors(
   } catch (e) {
     logger.fatal(
       "Error fetching local custom behaviors",
-      { path: resolvedPath, error: e },
+      { path: resolvedPath, ...formatErr(e) },
       "behavior",
     );
   }
@@ -1,5 +1,9 @@
-import { logger } from "./logger.js";
+import fs from "fs";
+
 import { MAX_DEPTH } from "./constants.js";
+import { collectOnlineSeedFile } from "./file_reader.js";
+import { logger } from "./logger.js";
+import { type CrawlerArgs } from "./argParser.js";

 type ScopeType =
   | "prefix"

@@ -39,14 +43,14 @@ export class ScopedSeed {
     auth = null,
   }: {
     url: string;
-    scopeType: ScopeType;
+    scopeType: ScopeType | undefined;
     include: string[];
     exclude: string[];
     allowHash?: boolean;
     depth?: number;
     sitemap?: string | boolean | null;
     extraHops?: number;
-    auth: string | null;
+    auth?: string | null;
   }) {
     const parsedUrl = this.parseUrl(url);
     if (!parsedUrl) {
@@ -62,14 +66,14 @@ export class ScopedSeed {
     this.url = parsedUrl.href;
     this.include = parseRx(include);
     this.exclude = parseRx(exclude);
-    this.scopeType = scopeType;

     this._includeStr = include;
     this._excludeStr = exclude;

-    if (!this.scopeType) {
-      this.scopeType = this.include.length ? "custom" : "prefix";
+    if (!scopeType) {
+      scopeType = this.include.length ? "custom" : "prefix";
     }
+    this.scopeType = scopeType;

     if (this.scopeType !== "custom") {
       const [includeNew, allowHashNew] = this.scopeFromType(
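The reshuffled constructor logic above resolves the default scope on the local scopeType argument before assigning it, so this.scopeType is only set once. The defaulting rule itself is unchanged and can be read off in isolation; in the sketch below, resolveScopeType and the two-value ScopeType union are illustrative only (the real union has more members):

// Illustrative only: the real ScopeType union includes more values than these two.
type ScopeType = "prefix" | "custom";

// Mirrors the constructor logic: an explicit scopeType wins; otherwise a seed
// with include regexes defaults to "custom" scope, a plain seed to "prefix".
function resolveScopeType(
  scopeType: ScopeType | undefined,
  include: RegExp[],
): ScopeType {
  if (!scopeType) {
    scopeType = include.length ? "custom" : "prefix";
  }
  return scopeType;
}

// resolveScopeType(undefined, [])          -> "prefix"
// resolveScopeType(undefined, [/example/]) -> "custom"
// resolveScopeType("custom", [])           -> "custom"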
@@ -300,6 +304,72 @@ export class ScopedSeed {
   }
 }

+export async function parseSeeds(params: CrawlerArgs): Promise<ScopedSeed[]> {
+  let seeds = params.seeds as string[];
+  const scopedSeeds: ScopedSeed[] = [];
+
+  if (params.seedFile) {
+    let seedFilePath = params.seedFile as string;
+    if (
+      seedFilePath.startsWith("http://") ||
+      seedFilePath.startsWith("https://")
+    ) {
+      seedFilePath = await collectOnlineSeedFile(seedFilePath);
+    }
+
+    const urlSeedFile = fs.readFileSync(seedFilePath, "utf8");
+    const urlSeedFileList = urlSeedFile.split("\n");
+
+    if (typeof seeds === "string") {
+      seeds = [seeds];
+    }
+
+    for (const seed of urlSeedFileList) {
+      if (seed) {
+        seeds.push(seed);
+      }
+    }
+  }
+
+  const scopeOpts = {
+    scopeType: params.scopeType as ScopeType | undefined,
+    sitemap: params.sitemap,
+    include: params.include,
+    exclude: params.exclude,
+    depth: params.depth,
+    extraHops: params.extraHops,
+  };
+
+  for (const seed of seeds) {
+    const newSeed = typeof seed === "string" ? { url: seed } : seed;
+
+    try {
+      scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    } catch (e: any) {
+      logger.error("Failed to create seed", {
+        error: e.toString(),
+        ...scopeOpts,
+        ...newSeed,
+      });
+      if (params.failOnFailedSeed) {
+        logger.fatal(
+          "Invalid seed specified, aborting crawl",
+          { url: newSeed.url },
+          "general",
+          1,
+        );
+      }
+    }
+  }
+
+  if (!params.qaSource && !scopedSeeds.length) {
+    logger.fatal("No valid seeds specified, aborting crawl");
+  }
+
+  return scopedSeeds;
+}
+
 export function rxEscape(string: string) {
   return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
 }
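Taken together with the argParser change, the split is now: parseArgs stays synchronous and returns raw CrawlerArgs, while parseSeeds does the async expansion (remote seed file download, line splitting, ScopedSeed construction). A sketch of the pairing, as the updated scopes test below drives it; loadSeedsFromArgs and the example URL are hypothetical, and import paths are abbreviated:

import { parseArgs } from "./util/argParser.js";
import { parseSeeds } from "./util/seeds.js";

// Hypothetical wrapper: parse CLI/config args synchronously, then resolve
// seeds asynchronously, as the crawler's bootstrap and the tests now do.
async function loadSeedsFromArgs(argvList: string[]) {
  // e.g. ["node", "crawler", "--seedFile", "https://example.com/seeds.txt"]
  const params = parseArgs(argvList);
  // Downloads the seed file if it is an http(s) URL, appends each non-empty
  // line to the seed list, and wraps every entry in a ScopedSeed.
  return await parseSeeds(params);
}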
@@ -351,7 +351,7 @@ export class PageWorker {
     let loggedWaiting = false;

     while (await this.crawler.isCrawlRunning()) {
-      await crawlState.processMessage(this.crawler.params.scopedSeeds);
+      await crawlState.processMessage(this.crawler.seeds);

       const data = await crawlState.nextFromQueue();

@@ -1,8 +1,9 @@
 import { parseArgs } from "../dist/util/argParser.js";
+import { parseSeeds } from "../dist/util/seeds.js";

 import fs from "fs";

-function getSeeds(config) {
+async function getSeeds(config) {
   const orig = fs.readFileSync;

   fs.readFileSync = (name, ...args) => {

@@ -12,12 +13,12 @@ function getSeeds(config) {
     return orig(name, ...args);
   };

-  const res = parseArgs(["node", "crawler", "--config", "stdinconfig"]);
-  return res.scopedSeeds;
+  const params = parseArgs(["node", "crawler", "--config", "stdinconfig"]);
+  return await parseSeeds(params);
 }

 test("default scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - https://example.com/

@@ -30,7 +31,7 @@ seeds:
 });

 test("default scope + exclude", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - https://example.com/

@@ -45,7 +46,7 @@ exclude: https://example.com/pathexclude
 });

 test("default scope + exclude is numeric", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - https://example.com/

@@ -60,7 +61,7 @@ exclude: "2022"
 });

 test("prefix scope global + exclude", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - https://example.com/

@@ -76,7 +77,7 @@ exclude: https://example.com/pathexclude
 });

 test("prefix scope per seed + exclude", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/
     scopeType: prefix

@@ -92,7 +93,7 @@ exclude: https://example.com/pathexclude
 });

 test("host scope and domain scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`

 seeds:
   - url: https://example.com/

@@ -127,7 +128,7 @@ seeds:
 });

 test("domain scope drop www.", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://www.example.com/
     scopeType: domain

@@ -139,7 +140,7 @@ seeds:
 });

 test("custom scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/
     include: https?://example.com/(path|other)

@@ -153,7 +154,7 @@ seeds:
 });

 test("inherit scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`

 seeds:
   - url: https://example.com/1

@@ -177,7 +178,7 @@ exclude: https://example.com/pathexclude
 });

 test("override scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`

 seeds:
   - url: https://example.com/1

@@ -220,7 +221,7 @@ include: https://example.com/onlythispath
 });

 test("override scope with exclude", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`

 seeds:
   - url: https://example.com/1

@@ -275,7 +276,7 @@ exclude:
 });

 test("with exclude non-string types", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/
     exclude: "2023"
@@ -38,3 +38,39 @@ test("check that URLs in seed-list are crawled", async () => {
   }
   expect(foundSeedUrl).toBe(true);
 });
+
+
+test("check that URLs in seed-list hosted at URL are crawled", async () => {
+  try {
+    await exec(
+      'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000',
+    );
+  } catch (error) {
+    console.log(error);
+  }
+
+  let crawled_pages = fs.readFileSync(
+    "test-crawls/collections/onlinefilelisttest/pages/pages.jsonl",
+    "utf8",
+  );
+  let seed_file = fs
+    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
+    .split("\n")
+    .sort();
+
+  let seed_file_list = [];
+  for (var j = 0; j < seed_file.length; j++) {
+    if (seed_file[j] != undefined) {
+      seed_file_list.push(seed_file[j]);
+    }
+  }
+
+  let foundSeedUrl = true;
+
+  for (var i = 1; i < seed_file_list.length; i++) {
+    if (crawled_pages.indexOf(seed_file_list[i]) == -1) {
+      foundSeedUrl = false;
+    }
+  }
+  expect(foundSeedUrl).toBe(true);
+});