browsertrix-crawler/tests/scopes.test.js
Ilya Kreymer 473de8c49f
Scope Handling Improvements + Tests (#66)
* scope fixes:
- remove default prefix scopeType, ensure scope include and exclude take precedence
- add new 'custom' scopeType, when include or exclude are used
- use --scopeIncludeRx and --scopeExcludeRx for better consistency for scope include and exclude (also allow --include/--exclude)
- ensure per-seed scope include/exclude are used when present, and scopeType is set to 'custom'
- ensure default scope is set to 'prefix' if no scopeType and no include/exclude regexes specified
- rename --type to --scopeType in seed to maintain consistency
- add sitemap param as alias for useSitemap

tests: 
- add seed scope resolution tests for argParse, testing per-seed scope resolution, inheritance and overrides
- fix screencaster to use relative paths to work with tests
- ci: use yarn instead of npm

* update README with new flags

* bump version to 0.4.0-beta.3
2021-07-06 20:22:27 -07:00

166 lines
4.4 KiB
JavaScript

const { parseArgs } = require("../util/argParser");
const fs = require("fs");
/**
 * Run `parseArgs` against an in-memory config and return the resolved seeds.
 *
 * `fs.readFileSync` is temporarily replaced so that reading any path ending in
 * "/configtest" yields `config` instead of touching the disk; every other read
 * is forwarded to the real implementation. The real function is put back
 * before returning (also when parsing throws), so stubs neither stack up
 * across calls nor leak into code that runs afterwards.
 *
 * @param {string} config - Contents returned in place of the "configtest" file.
 * @returns {*} The `scopedSeeds` property of the object returned by `parseArgs`.
 */
function getSeeds(config) {
  const orig = fs.readFileSync;

  fs.readFileSync = (name, ...args) => {
    // readFileSync also accepts Buffers, URLs and file descriptors; only a
    // string path can match the fake config name.
    if (typeof name === "string" && name.endsWith("/configtest")) {
      return config;
    }
    return orig(name, ...args);
  };

  try {
    return parseArgs(null, ["node", "crawler", "--config", "configtest"]).scopedSeeds;
  } finally {
    // Always undo the monkey-patch so each call starts from the real fs.
    fs.readFileSync = orig;
  }
}
// A bare seed URL with no scope options is expected to resolve to the
// "prefix" scope type with a single include rule and no excludes.
test("default scope", async () => {
  const yamlConfig = `
seeds:
- https://example.com/
`;
  const resolved = getSeeds(yamlConfig);

  expect(resolved.length).toEqual(1);

  const onlySeed = resolved[0];
  expect(onlySeed.scopeType).toEqual("prefix");
  // The expected include pattern is anchored at the start of the URL.
  expect(onlySeed.include).toEqual([/^https:\/\/example\.com\//]);
  expect(onlySeed.exclude).toEqual([]);
});
// A seed that carries its own include/exclude regexes is expected to get the
// "custom" scope type with exactly those rules.
test("custom scope", async () => {
  // include/exclude are nested under the seed entry so YAML parses them as
  // per-seed options. At column 0 they would be top-level options, and this
  // test would only repeat what "inherit scope" already covers.
  const seeds = getSeeds(`
seeds:
  - url: https://example.com/
    include: https://example.com/(path|other)
    exclude: https://example.com/pathexclude
`);

  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("custom");
  expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});
// Seeds without their own scope options are expected to pick up the
// top-level include/exclude regexes and the "custom" scope type.
test("inherit scope", async () => {
  const yamlConfig = `
seeds:
- url: https://example.com/1
- url: https://example.com/2
include: https://example.com/(path|other)
exclude: https://example.com/pathexclude
`;
  const resolved = getSeeds(yamlConfig);

  expect(resolved.length).toEqual(2);

  // Both seeds share the same expectations apart from their URL.
  const expectedUrls = ["https://example.com/1", "https://example.com/2"];
  expectedUrls.forEach((expectedUrl, idx) => {
    const seed = resolved[idx];
    expect(seed.scopeType).toEqual("custom");
    expect(seed.url).toEqual(expectedUrl);
    expect(seed.include).toEqual([/https:\/\/example.com\/(path|other)/]);
    expect(seed.exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
  });
});
// Mixes per-seed and top-level scope options and checks which one each seed
// ends up with.
test("override scope", async () => {
  // YAML nesting matters here: options indented under a seed belong to that
  // seed, while the final `include` at column 0 is the top-level default.
  // With every line flush-left the document is not valid YAML (a sequence
  // item would follow a top-level mapping key).
  const seeds = getSeeds(`
seeds:
  - url: https://example.com/1
    include: https://example.com/(path|other)

  - https://example.com/2

  - url: https://example.com/subpath/file.html
    scopeType: prefix

include: https://example.com/onlythispath
`);

  expect(seeds.length).toEqual(3);

  // Seed 1: its own include is expected to win over the top-level one.
  expect(seeds[0].scopeType).toEqual("custom");
  expect(seeds[0].url).toEqual("https://example.com/1");
  expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
  expect(seeds[0].exclude).toEqual([]);

  // Seed 2: no options of its own, so the top-level include is expected.
  expect(seeds[1].scopeType).toEqual("custom");
  expect(seeds[1].url).toEqual("https://example.com/2");
  expect(seeds[1].include).toEqual([/https:\/\/example.com\/onlythispath/]);
  expect(seeds[1].exclude).toEqual([]);

  // Seed 3: an explicit scopeType is expected to replace the top-level include.
  expect(seeds[2].scopeType).toEqual("prefix");
  expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
  expect(seeds[2].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
  expect(seeds[2].exclude).toEqual([]);
});
// Checks each explicit scopeType together with a top-level exclude list, and
// that a seed can opt out of that list with an empty per-seed exclude.
test("override scope with exclude", async () => {
  // YAML nesting matters here: `scopeType` and the empty `exclude` are
  // indented under their seed, while the `exclude` list at column 0 is the
  // top-level default. With every line flush-left the document is not valid
  // YAML (sequence items interleaved with top-level keys, duplicate
  // `exclude` key).
  const seeds = getSeeds(`
seeds:
  - url: https://example.com/1
    scopeType: page

  - url: https://example.com/subpath/file.html
    scopeType: prefix

  - url: https://example.com/2
    scopeType: any

  - url: https://example.com/3
    scopeType: none

  - url: https://example.com/4
    scopeType: none
    exclude: ''

exclude:
  - /search\\?
  - q\\?
`);

  expect(seeds.length).toEqual(5);

  // Regexes expected from the top-level exclude list.
  const excludeRxs = [/\/search\?/, /q\?/];

  expect(seeds[0].scopeType).toEqual("page");
  expect(seeds[0].url).toEqual("https://example.com/1");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\/1#.+/]);
  expect(seeds[0].exclude).toEqual(excludeRxs);

  expect(seeds[1].scopeType).toEqual("prefix");
  expect(seeds[1].url).toEqual("https://example.com/subpath/file.html");
  expect(seeds[1].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
  expect(seeds[1].exclude).toEqual(excludeRxs);

  expect(seeds[2].scopeType).toEqual("any");
  expect(seeds[2].url).toEqual("https://example.com/2");
  expect(seeds[2].include).toEqual([/.*/]);
  expect(seeds[2].exclude).toEqual(excludeRxs);

  expect(seeds[3].scopeType).toEqual("none");
  expect(seeds[3].url).toEqual("https://example.com/3");
  expect(seeds[3].include).toEqual([]);
  expect(seeds[3].exclude).toEqual(excludeRxs);

  // The empty per-seed exclude is expected to replace the top-level list.
  expect(seeds[4].scopeType).toEqual("none");
  expect(seeds[4].url).toEqual("https://example.com/4");
  expect(seeds[4].include).toEqual([]);
  expect(seeds[4].exclude).toEqual([]);
});