mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00

Fixes #841 Crawler work toward long URL lists in Browsertrix. This PR moves seed handling from the arg parser's validation step to the crawler's bootstrap step in order to be able to async fetch the seed file from a URL. --------- Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com> Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
328 lines
9 KiB
JavaScript
328 lines
9 KiB
JavaScript
import { parseArgs } from "../dist/util/argParser.js";
|
|
import { parseSeeds } from "../dist/util/seeds.js";
|
|
|
|
import fs from "fs";
|
|
|
|
async function getSeeds(config) {
|
|
const orig = fs.readFileSync;
|
|
|
|
fs.readFileSync = (name, ...args) => {
|
|
if (name.endsWith("/stdinconfig")) {
|
|
return config;
|
|
}
|
|
return orig(name, ...args);
|
|
};
|
|
|
|
const params = parseArgs(["node", "crawler", "--config", "stdinconfig"]);
|
|
return await parseSeeds(params);
|
|
}
|
|
|
|
test("default scope", async () => {
|
|
const seeds = await getSeeds(`
|
|
seeds:
|
|
- https://example.com/
|
|
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(1);
|
|
expect(seeds[0].scopeType).toEqual("prefix");
|
|
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
|
|
expect(seeds[0].exclude).toEqual([]);
|
|
});
|
|
|
|
test("default scope + exclude", async () => {
|
|
const seeds = await getSeeds(`
|
|
seeds:
|
|
- https://example.com/
|
|
|
|
exclude: https://example.com/pathexclude
|
|
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(1);
|
|
expect(seeds[0].scopeType).toEqual("prefix");
|
|
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
|
|
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
|
|
});
|
|
|
|
test("default scope + exclude is numeric", async () => {
|
|
const seeds = await getSeeds(`
|
|
seeds:
|
|
- https://example.com/
|
|
|
|
exclude: "2022"
|
|
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(1);
|
|
expect(seeds[0].scopeType).toEqual("prefix");
|
|
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
|
|
expect(seeds[0].exclude).toEqual([/2022/]);
|
|
});
|
|
|
|
test("prefix scope global + exclude", async () => {
|
|
const seeds = await getSeeds(`
|
|
seeds:
|
|
- https://example.com/
|
|
|
|
scopeType: prefix
|
|
exclude: https://example.com/pathexclude
|
|
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(1);
|
|
expect(seeds[0].scopeType).toEqual("prefix");
|
|
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
|
|
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
|
|
});
|
|
|
|
test("prefix scope per seed + exclude", async () => {
|
|
const seeds = await getSeeds(`
|
|
seeds:
|
|
- url: https://example.com/
|
|
scopeType: prefix
|
|
|
|
exclude: https://example.com/pathexclude
|
|
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(1);
|
|
expect(seeds[0].scopeType).toEqual("prefix");
|
|
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
|
|
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
|
|
});
|
|
|
|
test("host scope and domain scope", async () => {
|
|
const seeds = await getSeeds(`
|
|
|
|
seeds:
|
|
- url: https://example.com/
|
|
scopeType: domain
|
|
|
|
- url: https://example.org/
|
|
scopeType: host
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(2);
|
|
expect(seeds[0].scopeType).toEqual("domain");
|
|
expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);
|
|
expect(!!seeds[0].include[0].exec("https://example.com/")).toEqual(true);
|
|
expect(!!seeds[0].include[0].exec("https://example.com/path")).toEqual(true);
|
|
expect(!!seeds[0].include[0].exec("https://sub.example.com/path")).toEqual(
|
|
true,
|
|
);
|
|
expect(
|
|
!!seeds[0].include[0].exec("https://sub.domain.example.com/path"),
|
|
).toEqual(true);
|
|
expect(
|
|
!!seeds[0].include[0].exec("https://notsub.domainexample.com/path"),
|
|
).toEqual(false);
|
|
|
|
expect(seeds[1].scopeType).toEqual("host");
|
|
expect(seeds[1].include).toEqual([/^https?:\/\/example\.org\//]);
|
|
expect(!!seeds[1].include[0].exec("https://example.org/")).toEqual(true);
|
|
expect(!!seeds[1].include[0].exec("https://example.org/path")).toEqual(true);
|
|
expect(!!seeds[1].include[0].exec("https://sub.example.com/path")).toEqual(
|
|
false,
|
|
);
|
|
});
|
|
|
|
test("domain scope drop www.", async () => {
|
|
const seeds = await getSeeds(`
|
|
seeds:
|
|
- url: https://www.example.com/
|
|
scopeType: domain
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(1);
|
|
expect(seeds[0].scopeType).toEqual("domain");
|
|
expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);
|
|
});
|
|
|
|
test("custom scope", async () => {
|
|
const seeds = await getSeeds(`
|
|
seeds:
|
|
- url: https://example.com/
|
|
include: https?://example.com/(path|other)
|
|
exclude: https?://example.com/pathexclude
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(1);
|
|
expect(seeds[0].scopeType).toEqual("custom");
|
|
expect(seeds[0].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
|
|
expect(seeds[0].exclude).toEqual([/https?:\/\/example.com\/pathexclude/]);
|
|
});
|
|
|
|
test("inherit scope", async () => {
|
|
const seeds = await getSeeds(`
|
|
|
|
seeds:
|
|
- url: https://example.com/1
|
|
- url: https://example.com/2
|
|
|
|
include: https?://example.com/(path|other)
|
|
exclude: https://example.com/pathexclude
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(2);
|
|
|
|
expect(seeds[0].scopeType).toEqual("custom");
|
|
expect(seeds[0].url).toEqual("https://example.com/1");
|
|
expect(seeds[0].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
|
|
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
|
|
|
|
expect(seeds[1].scopeType).toEqual("custom");
|
|
expect(seeds[1].url).toEqual("https://example.com/2");
|
|
expect(seeds[1].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
|
|
expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
|
|
});
|
|
|
|
test("override scope", async () => {
|
|
const seeds = await getSeeds(`
|
|
|
|
seeds:
|
|
- url: https://example.com/1
|
|
include: https://example.com/(path|other)
|
|
|
|
- https://example.com/2
|
|
|
|
- url: https://example.com/subpath/file.html
|
|
scopeType: prefix
|
|
|
|
- url: https://example.com/subpath/file.html
|
|
|
|
include: https://example.com/onlythispath
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(4);
|
|
|
|
expect(seeds[0].scopeType).toEqual("custom");
|
|
expect(seeds[0].url).toEqual("https://example.com/1");
|
|
expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
|
|
expect(seeds[0].exclude).toEqual([]);
|
|
|
|
expect(seeds[1].scopeType).toEqual("custom");
|
|
expect(seeds[1].url).toEqual("https://example.com/2");
|
|
expect(seeds[1].include).toEqual([/https:\/\/example.com\/onlythispath/]);
|
|
expect(seeds[1].exclude).toEqual([]);
|
|
|
|
expect(seeds[2].scopeType).toEqual("prefix");
|
|
expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
|
|
expect(seeds[2].include).toEqual([
|
|
/^https?:\/\/example\.com\/subpath\//,
|
|
/https:\/\/example.com\/onlythispath/,
|
|
]);
|
|
expect(seeds[2].exclude).toEqual([]);
|
|
|
|
expect(seeds[3].scopeType).toEqual("custom");
|
|
expect(seeds[3].url).toEqual("https://example.com/subpath/file.html");
|
|
expect(seeds[3].include).toEqual([/https:\/\/example.com\/onlythispath/]);
|
|
expect(seeds[3].exclude).toEqual([]);
|
|
});
|
|
|
|
test("override scope with exclude", async () => {
|
|
const seeds = await getSeeds(`
|
|
|
|
seeds:
|
|
- url: https://example.com/1
|
|
scopeType: page-spa
|
|
|
|
- url: https://example.com/subpath/file.html
|
|
scopeType: prefix
|
|
|
|
- url: https://example.com/2
|
|
scopeType: any
|
|
|
|
- url: https://example.com/3
|
|
scopeType: page
|
|
|
|
- url: https://example.com/4
|
|
scopeType: page
|
|
exclude: ''
|
|
|
|
exclude:
|
|
- /search\\?
|
|
- q\\?
|
|
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(5);
|
|
const excludeRxs = [/\/search\?/, /q\?/];
|
|
|
|
expect(seeds[0].scopeType).toEqual("page-spa");
|
|
expect(seeds[0].url).toEqual("https://example.com/1");
|
|
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\/1#.+/]);
|
|
expect(seeds[0].exclude).toEqual(excludeRxs);
|
|
|
|
expect(seeds[1].scopeType).toEqual("prefix");
|
|
expect(seeds[1].url).toEqual("https://example.com/subpath/file.html");
|
|
expect(seeds[1].include).toEqual([/^https?:\/\/example\.com\/subpath\//]);
|
|
expect(seeds[1].exclude).toEqual(excludeRxs);
|
|
|
|
expect(seeds[2].scopeType).toEqual("any");
|
|
expect(seeds[2].url).toEqual("https://example.com/2");
|
|
expect(seeds[2].include).toEqual([/.*/]);
|
|
expect(seeds[2].exclude).toEqual(excludeRxs);
|
|
|
|
expect(seeds[3].scopeType).toEqual("page");
|
|
expect(seeds[3].url).toEqual("https://example.com/3");
|
|
expect(seeds[3].include).toEqual([]);
|
|
expect(seeds[3].exclude).toEqual(excludeRxs);
|
|
|
|
expect(seeds[4].scopeType).toEqual("page");
|
|
expect(seeds[4].url).toEqual("https://example.com/4");
|
|
expect(seeds[4].include).toEqual([]);
|
|
expect(seeds[4].exclude).toEqual([]);
|
|
});
|
|
|
|
test("with exclude non-string types", async () => {
|
|
const seeds = await getSeeds(`
|
|
seeds:
|
|
- url: https://example.com/
|
|
exclude: "2023"
|
|
|
|
- url: https://example.com/
|
|
exclude: 2023
|
|
|
|
- url: https://example.com/
|
|
exclude: "0"
|
|
|
|
- url: https://example.com/
|
|
exclude: 0
|
|
|
|
- url: https://example.com/
|
|
exclude:
|
|
|
|
- url: https://example.com/
|
|
exclude: ""
|
|
|
|
- url: https://example.com/
|
|
exclude: null
|
|
|
|
- url: https://example.com/
|
|
exclude: "null"
|
|
|
|
- url: https://example.com/
|
|
exclude: false
|
|
|
|
- url: https://example.com/
|
|
exclude: true
|
|
`);
|
|
|
|
expect(seeds.length).toEqual(10);
|
|
for (let i = 0; i < 10; i++) {
|
|
expect(seeds[i].scopeType).toEqual("prefix");
|
|
expect(seeds[i].include).toEqual([/^https?:\/\/example\.com\//]);
|
|
}
|
|
|
|
expect(seeds[0].exclude).toEqual([/2023/]);
|
|
expect(seeds[1].exclude).toEqual([/2023/]);
|
|
expect(seeds[2].exclude).toEqual([/0/]);
|
|
expect(seeds[3].exclude).toEqual([/0/]);
|
|
expect(seeds[4].exclude).toEqual([]);
|
|
expect(seeds[5].exclude).toEqual([]);
|
|
expect(seeds[6].exclude).toEqual([]);
|
|
expect(seeds[7].exclude).toEqual([/null/]);
|
|
expect(seeds[8].exclude).toEqual([/false/]);
|
|
expect(seeds[9].exclude).toEqual([/true/]);
|
|
});
|