2021-07-06 20:22:27 -07:00
|
|
|
const { parseArgs } = require("../util/argParser");
|
|
|
|
|
|
|
|
const fs = require("fs");
|
|
|
|
|
|
|
|
/**
 * Run parseArgs against an in-memory config and return the resulting seeds.
 *
 * parseArgs is invoked with `--config configtest`. While it runs,
 * fs.readFileSync is temporarily replaced so that any string path ending in
 * "/configtest" yields `config` instead of reading from disk; every other
 * read is forwarded to the real implementation.
 *
 * @param {string} config - Config file contents handed back for "configtest".
 * @returns {*} The `scopedSeeds` property of the object returned by parseArgs.
 */
function getSeeds(config) {
  const orig = fs.readFileSync;

  fs.readFileSync = (name, ...args) => {
    // readFileSync also accepts Buffer, URL and file-descriptor arguments,
    // none of which have endsWith(); only string paths can match the fake file.
    if (typeof name === "string" && name.endsWith("/configtest")) {
      return config;
    }
    return orig(name, ...args);
  };

  try {
    return parseArgs(["node", "crawler", "--config", "configtest"]).scopedSeeds;
  } finally {
    // Always undo the patch, even if parseArgs throws, so the stub neither
    // leaks into other code nor stacks up across repeated calls.
    fs.readFileSync = orig;
  }
}
|
|
|
|
|
|
|
|
// Default scope: the config lists a single seed as a bare URL string with no
// scope options. The parsed result is expected to contain exactly one seed
// whose scopeType is "prefix", whose include list is one regex anchored at the
// seed URL, and whose exclude list is empty.
test("default scope", async () => {
|
|
|
|
const seeds = getSeeds(`
|
|
|
|
seeds:
|
|
|
|
- https://example.com/
|
|
|
|
|
|
|
|
`);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
expect(seeds.length).toEqual(1);
|
|
|
|
expect(seeds[0].scopeType).toEqual("prefix");
|
|
|
|
expect(seeds[0].include).toEqual([/^https:\/\/example\.com\//]);
|
|
|
|
expect(seeds[0].exclude).toEqual([]);
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
// Custom scope: one seed is configured together with an include and an exclude
// pattern. The seed is expected to come back with scopeType "custom" and with
// each pattern compiled, unanchored, into a single-element RegExp list.
// NOTE(review): the YAML nesting is not visible here; include/exclude
// presumably belong to the seed entry itself — confirm.
test("custom scope", async () => {
|
|
|
|
const seeds = getSeeds(`
|
|
|
|
seeds:
|
|
|
|
- url: https://example.com/
|
|
|
|
include: https://example.com/(path|other)
|
|
|
|
exclude: https://example.com/pathexclude
|
|
|
|
`);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
expect(seeds.length).toEqual(1);
|
|
|
|
expect(seeds[0].scopeType).toEqual("custom");
|
|
|
|
expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
|
|
|
|
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
// Inherit scope: two seeds are given with only a url, followed by one include
// and one exclude pattern. Both seeds are expected to end up with scopeType
// "custom" and the same include/exclude regexes, while keeping their own url.
// NOTE(review): the YAML nesting is not visible here; include/exclude are
// presumably top-level keys shared by all seeds — confirm.
test("inherit scope", async () => {
|
|
|
|
const seeds = getSeeds(`
|
|
|
|
|
|
|
|
seeds:
|
|
|
|
- url: https://example.com/1
|
|
|
|
- url: https://example.com/2
|
|
|
|
|
|
|
|
include: https://example.com/(path|other)
|
|
|
|
exclude: https://example.com/pathexclude
|
|
|
|
`);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
expect(seeds.length).toEqual(2);
|
|
|
|
|
|
|
|
expect(seeds[0].scopeType).toEqual("custom");
|
|
|
|
expect(seeds[0].url).toEqual("https://example.com/1");
|
|
|
|
expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
|
|
|
|
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
|
|
|
|
|
|
|
|
expect(seeds[1].scopeType).toEqual("custom");
|
|
|
|
expect(seeds[1].url).toEqual("https://example.com/2");
|
|
|
|
expect(seeds[1].include).toEqual([/https:\/\/example.com\/(path|other)/]);
|
|
|
|
expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
// Override scope: three seeds are combined with a trailing include pattern
// ("onlythispath"). Expected outcome per seed:
//   [0] supplies its own include, so it is "custom" with that include only;
//   [1] is a bare URL and is "custom" with the "onlythispath" include;
//   [2] sets scopeType "prefix" and gets the prefix regex for its directory
//       rather than the "onlythispath" include.
// Every seed is expected to have an empty exclude list.
// NOTE(review): the YAML nesting is not visible here; the first include
// presumably belongs to seed [0] and the last one is presumably top-level —
// confirm.
test("override scope", async () => {
|
|
|
|
const seeds = getSeeds(`
|
|
|
|
|
|
|
|
seeds:
|
|
|
|
- url: https://example.com/1
|
|
|
|
include: https://example.com/(path|other)
|
|
|
|
|
|
|
|
- https://example.com/2
|
|
|
|
|
|
|
|
- url: https://example.com/subpath/file.html
|
|
|
|
scopeType: prefix
|
|
|
|
|
|
|
|
include: https://example.com/onlythispath
|
|
|
|
`);
|
|
|
|
|
|
|
|
expect(seeds.length).toEqual(3);
|
|
|
|
|
|
|
|
expect(seeds[0].scopeType).toEqual("custom");
|
|
|
|
expect(seeds[0].url).toEqual("https://example.com/1");
|
|
|
|
expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
|
|
|
|
expect(seeds[0].exclude).toEqual([]);
|
|
|
|
|
|
|
|
expect(seeds[1].scopeType).toEqual("custom");
|
|
|
|
expect(seeds[1].url).toEqual("https://example.com/2");
|
|
|
|
expect(seeds[1].include).toEqual([/https:\/\/example.com\/onlythispath/]);
|
|
|
|
expect(seeds[1].exclude).toEqual([]);
|
|
|
|
|
|
|
|
expect(seeds[2].scopeType).toEqual("prefix");
|
|
|
|
expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
|
|
|
|
expect(seeds[2].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
|
|
|
|
expect(seeds[2].exclude).toEqual([]);
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
// Override scope with exclude: five seeds use scopeType page, prefix, any,
// none and none, combined with a two-entry exclude list. Inside the template
// literal "\\?" yields the text "\?", so the exclude patterns match a literal
// question mark.
// Expected include per scopeType, as asserted below:
//   page   -> the seed URL followed by "#" and at least one character
//   prefix -> the seed's directory prefix
//   any    -> /.*/
//   none   -> empty list
// Seeds [0]-[3] are expected to carry both exclude regexes; seed [4], which
// also has `exclude: ''`, is expected to end up with an empty exclude list.
// NOTE(review): the YAML nesting is not visible here; `exclude: ''`
// presumably belongs to seed [4] and the exclude list is presumably
// top-level — confirm.
test("override scope with exclude", async () => {
|
|
|
|
const seeds = getSeeds(`
|
|
|
|
|
|
|
|
seeds:
|
|
|
|
- url: https://example.com/1
|
|
|
|
scopeType: page
|
|
|
|
|
|
|
|
- url: https://example.com/subpath/file.html
|
|
|
|
scopeType: prefix
|
|
|
|
|
|
|
|
- url: https://example.com/2
|
|
|
|
scopeType: any
|
|
|
|
|
|
|
|
- url: https://example.com/3
|
|
|
|
scopeType: none
|
|
|
|
|
|
|
|
- url: https://example.com/4
|
|
|
|
scopeType: none
|
|
|
|
exclude: ''
|
|
|
|
|
|
|
|
exclude:
|
|
|
|
- /search\\?
|
|
|
|
- q\\?
|
|
|
|
|
|
|
|
`);
|
|
|
|
|
|
|
|
expect(seeds.length).toEqual(5);
|
|
|
|
const excludeRxs = [/\/search\?/, /q\?/];
|
|
|
|
|
|
|
|
expect(seeds[0].scopeType).toEqual("page");
|
|
|
|
expect(seeds[0].url).toEqual("https://example.com/1");
|
|
|
|
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\/1#.+/]);
|
|
|
|
expect(seeds[0].exclude).toEqual(excludeRxs);
|
|
|
|
|
|
|
|
expect(seeds[1].scopeType).toEqual("prefix");
|
|
|
|
expect(seeds[1].url).toEqual("https://example.com/subpath/file.html");
|
|
|
|
expect(seeds[1].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
|
|
|
|
expect(seeds[1].exclude).toEqual(excludeRxs);
|
|
|
|
|
|
|
|
expect(seeds[2].scopeType).toEqual("any");
|
|
|
|
expect(seeds[2].url).toEqual("https://example.com/2");
|
|
|
|
expect(seeds[2].include).toEqual([/.*/]);
|
|
|
|
expect(seeds[2].exclude).toEqual(excludeRxs);
|
|
|
|
|
|
|
|
expect(seeds[3].scopeType).toEqual("none");
|
|
|
|
expect(seeds[3].url).toEqual("https://example.com/3");
|
|
|
|
expect(seeds[3].include).toEqual([]);
|
|
|
|
expect(seeds[3].exclude).toEqual(excludeRxs);
|
|
|
|
|
|
|
|
expect(seeds[4].scopeType).toEqual("none");
|
|
|
|
expect(seeds[4].url).toEqual("https://example.com/4");
|
|
|
|
expect(seeds[4].include).toEqual([]);
|
|
|
|
expect(seeds[4].exclude).toEqual([]);
|
|
|
|
|
|
|
|
});
|
|
|
|
|