Scope Handling Improvements + Tests (#66)

* scope fixes:
- remove default prefix scopeType, ensure scope include and exclude take precedence
- add new 'custom' scopeType, when include or exclude are used
- use --scopeIncludeRx and --scopeExcludeRx for better consistency for scope include and exclude (also allow --include/--exclude)
- ensure per-seed scope include/exclude used when present, and scopeType set to 'custom'
- ensure default scope is set to 'prefix' if no scopeType and no include/exclude regexes specified
- rename --type to --scopeType in seed to maintain consistency
- add sitemap param as alias for useSitemap

tests: 
- add seed scope resolution tests for argParser, testing per-seed scope resolution, inheritance and overrides
- fix screencaster to use relative paths to work with tests
- ci: use yarn instead of npm

* update README with new flags

* bump version to 0.4.0-beta.3
This commit is contained in:
Ilya Kreymer 2021-07-06 20:22:27 -07:00 committed by GitHub
parent ef7d5e50d8
commit 473de8c49f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 216 additions and 36 deletions

View file

@ -18,7 +18,7 @@ jobs:
with:
node-version: ${{ matrix.node-version }}
- name: install requirements
run: npm install
run: yarn install
- name: run linter
run: yarn run eslint .
@ -37,7 +37,7 @@ jobs:
with:
node-version: ${{ matrix.node-version }}
- name: install requirements
run: npm install
run: yarn install
- name: build docker
run: docker-compose build
- name: run crawl

View file

@ -73,13 +73,15 @@ Options:
[number] [default: 0]
--timeout Timeout for each page to load (in
seconds) [number] [default: 90]
--scope Regex of page URLs that should be
--scopeType Predefined for which URLs to crawl,
can be: prefix, page, host, any, or
custom, to use the
scopeIncludeRx/scopeExcludeRx
[string]
--scopeIncludeRx, --include Regex of page URLs that should be
included in the crawl (defaults to
the immediate directory of URL)
--scopeType Simplified scope for which URLs to
crawl, can be: prefix, page, host,
any [string] [default: "prefix"]
--exclude Regex of page URLs that should be
--scopeExcludeRx, --exclude Regex of page URLs that should be
excluded from the crawl.
--allowHashUrls Allow Hashtag URLs, useful for
single-page-application crawling or
@ -88,7 +90,7 @@ Options:
-c, --collection Collection name to crawl to (replay
will be accessible under this name
in pywb preview)
[string] [default: "capture-2021-06-26T19-38-10"]
[string] [default: "capture-2021-07-01T23-43-26"]
--headless Run in headless mode, otherwise
start xvfb[boolean] [default: false]
--driver JS driver for the crawler
@ -122,7 +124,7 @@ Options:
--userAgentSuffix Append suffix to existing browser
user-agent (ex: +MyCrawler,
info@example.com) [string]
--useSitemap If enabled, check for sitemaps at
--useSitemap, --sitemap If enabled, check for sitemaps at
/sitemap.xml, or custom URL if URL
is specified
--statsFilename If set, output stats as JSON to this

View file

@ -2,7 +2,7 @@ version: '3.5'
services:
crawler:
image: webrecorder/browsertrix-crawler:0.4.0-beta.2
image: webrecorder/browsertrix-crawler:0.4.0-beta.3
build:
context: ./

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.4.0-beta.2",
"version": "0.4.0-beta.3",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",

View file

@ -10,7 +10,7 @@ test("pass config file via stdin", async () => {
try {
const version = require("../package.json").version;
const proc = child_process.execSync(`docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin --exclude webrecorder.net/202`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
const proc = child_process.execSync(`docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin --scopeExcludeRx webrecorder.net/202`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
console.log(proc);
}

166
tests/scopes.test.js Normal file
View file

@ -0,0 +1,166 @@
const { parseArgs } = require("../util/argParser");
const fs = require("fs");
/**
 * Run the argument parser against an in-memory YAML config and return the
 * scoped seeds it produces.
 *
 * fs.readFileSync is temporarily replaced so that reading any string path
 * ending in "/configtest" yields `config`; every other read is forwarded to
 * the real implementation. The real function is always put back before this
 * helper returns (or throws), so the stub cannot leak into later tests or
 * stack up across repeated calls.
 *
 * @param {string} config - YAML config text served in place of the "configtest" file.
 * @returns {*} the `scopedSeeds` property of the parsed arguments.
 */
function getSeeds(config) {
  const realReadFileSync = fs.readFileSync;

  fs.readFileSync = (name, ...args) => {
    // readFileSync also accepts Buffers, URLs and file descriptors; only
    // string paths can match the fake config name.
    if (typeof name === "string" && name.endsWith("/configtest")) {
      return config;
    }
    return realReadFileSync(name, ...args);
  };

  try {
    // presumably the parser resolves "configtest" to a path ending in
    // "/configtest" before reading it - confirm against util/argParser.
    return parseArgs(null, ["node", "crawler", "--config", "configtest"]).scopedSeeds;
  } finally {
    fs.readFileSync = realReadFileSync;
  }
}
// A seed given as a bare URL, with no scopeType and no include/exclude:
// the resolved seed is expected to use the "prefix" scope type, carry a single
// include regex anchored at the seed URL, and have an empty exclude list.
test("default scope", async () => {
const seeds = getSeeds(`
seeds:
- https://example.com/
`);
expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("prefix");
expect(seeds[0].include).toEqual([/^https:\/\/example\.com\//]);
expect(seeds[0].exclude).toEqual([]);
});
// A seed with include and exclude regexes but no explicit scopeType:
// the resolved seed is expected to report scopeType "custom" and to expose the
// configured strings as regexes (compared here against unanchored literals).
test("custom scope", async () => {
const seeds = getSeeds(`
seeds:
- url: https://example.com/
include: https://example.com/(path|other)
exclude: https://example.com/pathexclude
`);
expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("custom");
expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});
// Two seeds that declare only a URL, plus one include and one exclude regex:
// both resolved seeds are expected to end up with scopeType "custom" and the
// same include/exclude regexes.
// NOTE(review): presumably include/exclude are config-level keys inherited by
// every seed - confirm against the YAML nesting.
test("inherit scope", async () => {
const seeds = getSeeds(`
seeds:
- url: https://example.com/1
- url: https://example.com/2
include: https://example.com/(path|other)
exclude: https://example.com/pathexclude
`);
expect(seeds.length).toEqual(2);
expect(seeds[0].scopeType).toEqual("custom");
expect(seeds[0].url).toEqual("https://example.com/1");
expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
expect(seeds[1].scopeType).toEqual("custom");
expect(seeds[1].url).toEqual("https://example.com/2");
expect(seeds[1].include).toEqual([/https:\/\/example.com\/(path|other)/]);
expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});
// Three seeds mixing their own settings with a shared include regex:
//  - seed 0 is expected to keep the "(path|other)" include and be "custom";
//  - seed 1 (bare URL) is expected to pick up the "onlythispath" include and be "custom";
//  - seed 2 sets scopeType "prefix" and is expected to get a prefix regex derived
//    from its own URL's directory rather than the "onlythispath" include.
// NOTE(review): presumably "onlythispath" is the config-level include that
// seeds 1 and 2 would otherwise inherit - confirm against the YAML nesting.
test("override scope", async () => {
const seeds = getSeeds(`
seeds:
- url: https://example.com/1
include: https://example.com/(path|other)
- https://example.com/2
- url: https://example.com/subpath/file.html
scopeType: prefix
include: https://example.com/onlythispath
`);
expect(seeds.length).toEqual(3);
expect(seeds[0].scopeType).toEqual("custom");
expect(seeds[0].url).toEqual("https://example.com/1");
expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
expect(seeds[0].exclude).toEqual([]);
expect(seeds[1].scopeType).toEqual("custom");
expect(seeds[1].url).toEqual("https://example.com/2");
expect(seeds[1].include).toEqual([/https:\/\/example.com\/onlythispath/]);
expect(seeds[1].exclude).toEqual([]);
expect(seeds[2].scopeType).toEqual("prefix");
expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
expect(seeds[2].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
expect(seeds[2].exclude).toEqual([]);
});
// Five seeds, each with an explicit scopeType (page, prefix, any, none, none),
// combined with a two-entry exclude list:
//  - seeds 0-3 are expected to keep their own scopeType and include regexes
//    while all carrying the same two exclude regexes;
//  - seed 4 also sets `exclude: ''` and is expected to end up with no excludes.
// The doubled backslashes inside the template literal become single
// backslashes in the YAML text, i.e. the patterns "/search\?" and "q\?".
// NOTE(review): presumably the exclude list is config-level and the empty
// per-seed exclude overrides it - confirm against the YAML nesting.
test("override scope with exclude", async () => {
const seeds = getSeeds(`
seeds:
- url: https://example.com/1
scopeType: page
- url: https://example.com/subpath/file.html
scopeType: prefix
- url: https://example.com/2
scopeType: any
- url: https://example.com/3
scopeType: none
- url: https://example.com/4
scopeType: none
exclude: ''
exclude:
- /search\\?
- q\\?
`);
expect(seeds.length).toEqual(5);
// Exclude regexes that seeds 0-3 are expected to share.
const excludeRxs = [/\/search\?/, /q\?/];
// "page": the expected include matches the seed URL under http or https, followed by a "#" fragment.
expect(seeds[0].scopeType).toEqual("page");
expect(seeds[0].url).toEqual("https://example.com/1");
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\/1#.+/]);
expect(seeds[0].exclude).toEqual(excludeRxs);
// "prefix": the expected include is anchored at the directory of the seed URL.
expect(seeds[1].scopeType).toEqual("prefix");
expect(seeds[1].url).toEqual("https://example.com/subpath/file.html");
expect(seeds[1].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
expect(seeds[1].exclude).toEqual(excludeRxs);
// "any": the expected include matches every URL.
expect(seeds[2].scopeType).toEqual("any");
expect(seeds[2].url).toEqual("https://example.com/2");
expect(seeds[2].include).toEqual([/.*/]);
expect(seeds[2].exclude).toEqual(excludeRxs);
// "none": no include regexes are expected at all.
expect(seeds[3].scopeType).toEqual("none");
expect(seeds[3].url).toEqual("https://example.com/3");
expect(seeds[3].include).toEqual([]);
expect(seeds[3].exclude).toEqual(excludeRxs);
// "none" with an empty per-seed exclude: both lists are expected to be empty.
expect(seeds[4].scopeType).toEqual("none");
expect(seeds[4].url).toEqual("https://example.com/4");
expect(seeds[4].include).toEqual([]);
expect(seeds[4].exclude).toEqual([]);
});

View file

@ -71,17 +71,18 @@ class ArgParser {
type: "number",
},
"scope": {
"scopeType": {
describe: "Predefined for which URLs to crawl, can be: prefix, page, host, any, or custom, to use the scopeIncludeRx/scopeExcludeRx",
type: "string",
},
"scopeIncludeRx": {
alias: "include",
describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
},
"scopeType": {
describe: "Simplified scope for which URLs to crawl, can be: prefix, page, host, any",
type: "string",
default: "prefix",
},
"exclude": {
"scopeExcludeRx": {
alias: "exclude",
describe: "Regex of page URLs that should be excluded from the crawl."
},
@ -169,6 +170,7 @@ class ArgParser {
},
"useSitemap": {
alias: "sitemap",
describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
},
@ -300,18 +302,21 @@ class ArgParser {
}
}
if (argv.include || argv.exclude) {
if (argv.scopeType && argv.scopeType !== "custom") {
console.warn("You've specified a --scopeType and a --scopeIncludeRx or --scopeExcludeRx regex. The custom scope regex will take precedence, overriding the scopeType");
argv.scopeType = "custom";
}
}
const scopeOpts = {
type: argv.scopeType,
sitemap: argv.useSitemap,
include: argv.scope,
scopeType: argv.scopeType,
sitemap: argv.sitemap,
include: argv.include,
exclude: argv.exclude,
depth: argv.depth,
};
if (argv.scope && argv.scopeType) {
console.warn("You've specified a --scopeType and a --scope regex. The custom scope regex will take precedence, overriding the scopeType");
}
argv.scopedSeeds = [];
for (let seed of argv.seeds) {

View file

@ -2,10 +2,11 @@ const ws = require("ws");
const http = require("http");
const url = require("url");
const fs = require("fs");
const path = require("path");
const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
const indexHTML = fs.readFileSync("/app/screencast/index.html", {encoding: "utf8"});
const indexHTML = fs.readFileSync(path.join(__dirname, "..", "screencast", "index.html"), {encoding: "utf8"});
// ===========================================================================

View file

@ -1,14 +1,20 @@
class ScopedSeed
{
constructor({url, type, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
const parsedUrl = this.parseUrl(url);
this.url = parsedUrl.href;
this.type = type;
if (type) {
[include, allowHash] = this.scopeFromType(type, parsedUrl);
}
this.include = this.parseRx(include);
this.exclude = this.parseRx(exclude);
if (!scopeType) {
scopeType = (this.include.length || this.exclude.length) ? "custom" : "prefix";
}
this.scopeType = scopeType;
if (this.scopeType !== "custom") {
[this.include, allowHash] = this.scopeFromType(this.scopeType, parsedUrl);
}
this.sitemap = this.resolveSiteMap(sitemap);
this.allowHash = allowHash;
this.maxDepth = depth < 0 ? 99999 : depth;
@ -44,11 +50,11 @@ class ScopedSeed
return sitemap;
}
scopeFromType(type, parsedUrl) {
scopeFromType(scopeType, parsedUrl) {
let include;
let allowHash = false;
switch (type) {
switch (scopeType) {
case "page":
// allow scheme-agnostic URLS as likely redirects
include = [new RegExp("^" + rxEscape(parsedUrl.href).replace(parsedUrl.protocol, "https?:") + "#.+")];
@ -72,7 +78,7 @@ class ScopedSeed
break;
default:
throw new Error(`Invalid scope type "${type}" specified, valid types are: page, prefix, host`);
throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, prefix, host`);
}
return [include, allowHash];