mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Scope Handling Improvements + Tests (#66)
* scope fixes:
  - remove default prefix scopeType, ensure scope include and exclude take precedence
  - add new 'custom' scopeType, when include or exclude are used
  - use --scopeIncludeRx and --scopeExcludeRx for better consistency for scope include and exclude (also allow --include/--exclude)
  - ensure per-seed scope include/exclude used when present, and scopeType set to 'custom'
  - ensure default scope is set to 'prefix' if no scopeType and no include/exclude regexes specified
  - rename --type to --scopeType in seed to maintain consistency
  - add sitemap param as alias for useSitemap

  tests:
  - add seed scope resolution tests for argParse, testing per-scope seed resolution, inheritance and overrides
  - fix screencaster to use relative paths to work with tests
  - ci: use yarn instead of npm
* update README with new flags
* bump version to 0.4.0-beta.3
This commit is contained in:
parent
ef7d5e50d8
commit
473de8c49f
9 changed files with 216 additions and 36 deletions
4
.github/workflows/ci.yaml
vendored
4
.github/workflows/ci.yaml
vendored
|
@ -18,7 +18,7 @@ jobs:
|
|||
with:
|
||||
node-version: ${{ matrix.node-version }}
|
||||
- name: install requirements
|
||||
run: npm install
|
||||
run: yarn install
|
||||
- name: run linter
|
||||
run: yarn run eslint .
|
||||
|
||||
|
@ -37,7 +37,7 @@ jobs:
|
|||
with:
|
||||
node-version: ${{ matrix.node-version }}
|
||||
- name: install requirements
|
||||
run: npm install
|
||||
run: yarn install
|
||||
- name: build docker
|
||||
run: docker-compose build
|
||||
- name: run crawl
|
||||
|
|
16
README.md
16
README.md
|
@ -73,13 +73,15 @@ Options:
|
|||
[number] [default: 0]
|
||||
--timeout Timeout for each page to load (in
|
||||
seconds) [number] [default: 90]
|
||||
--scope Regex of page URLs that should be
|
||||
--scopeType Predefined for which URLs to crawl,
|
||||
can be: prefix, page, host, any, or
|
||||
custom, to use the
|
||||
scopeIncludeRx/scopeExcludeRx
|
||||
[string]
|
||||
--scopeIncludeRx, --include Regex of page URLs that should be
|
||||
included in the crawl (defaults to
|
||||
the immediate directory of URL)
|
||||
--scopeType Simplified scope for which URLs to
|
||||
crawl, can be: prefix, page, host,
|
||||
any [string] [default: "prefix"]
|
||||
--exclude Regex of page URLs that should be
|
||||
--scopeExcludeRx, --exclude Regex of page URLs that should be
|
||||
excluded from the crawl.
|
||||
--allowHashUrls Allow Hashtag URLs, useful for
|
||||
single-page-application crawling or
|
||||
|
@ -88,7 +90,7 @@ Options:
|
|||
-c, --collection Collection name to crawl to (replay
|
||||
will be accessible under this name
|
||||
in pywb preview)
|
||||
[string] [default: "capture-2021-06-26T19-38-10"]
|
||||
[string] [default: "capture-2021-07-01T23-43-26"]
|
||||
--headless Run in headless mode, otherwise
|
||||
start xvfb[boolean] [default: false]
|
||||
--driver JS driver for the crawler
|
||||
|
@ -122,7 +124,7 @@ Options:
|
|||
--userAgentSuffix Append suffix to existing browser
|
||||
user-agent (ex: +MyCrawler,
|
||||
info@example.com) [string]
|
||||
--useSitemap If enabled, check for sitemaps at
|
||||
--useSitemap, --sitemap If enabled, check for sitemaps at
|
||||
/sitemap.xml, or custom URL if URL
|
||||
is specified
|
||||
--statsFilename If set, output stats as JSON to this
|
||||
|
|
|
@ -2,7 +2,7 @@ version: '3.5'
|
|||
|
||||
services:
|
||||
crawler:
|
||||
image: webrecorder/browsertrix-crawler:0.4.0-beta.2
|
||||
image: webrecorder/browsertrix-crawler:0.4.0-beta.3
|
||||
build:
|
||||
context: ./
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "0.4.0-beta.2",
|
||||
"version": "0.4.0-beta.3",
|
||||
"main": "browsertrix-crawler",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||
|
|
|
@ -10,7 +10,7 @@ test("pass config file via stdin", async () => {
|
|||
|
||||
try {
|
||||
const version = require("../package.json").version;
|
||||
const proc = child_process.execSync(`docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin --exclude webrecorder.net/202`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
|
||||
const proc = child_process.execSync(`docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin --scopeExcludeRx webrecorder.net/202`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
|
||||
|
||||
console.log(proc);
|
||||
}
|
||||
|
|
166
tests/scopes.test.js
Normal file
166
tests/scopes.test.js
Normal file
|
@ -0,0 +1,166 @@
|
|||
const { parseArgs } = require("../util/argParser");
|
||||
|
||||
const fs = require("fs");
|
||||
|
||||
/**
 * Parse an in-memory YAML config and return the scoped seeds it resolves to.
 *
 * The parser is invoked with `--config configtest`. While it runs,
 * `fs.readFileSync` is replaced by a stub that returns `config` for any
 * string path ending in "/configtest" and defers to the real implementation
 * for everything else.
 *
 * @param {string} config - YAML text to serve as the contents of the config file.
 * @returns {Array} The `scopedSeeds` array from the parsed arguments.
 */
function getSeeds(config) {
  const orig = fs.readFileSync;

  fs.readFileSync = (name, ...args) => {
    // readFileSync also accepts Buffer, URL and file-descriptor arguments,
    // which have no endsWith(); only string paths can match the fake config.
    if (typeof name === "string" && name.endsWith("/configtest")) {
      return config;
    }
    return orig(name, ...args);
  };

  try {
    return parseArgs(null, ["node", "crawler", "--config", "configtest"]).scopedSeeds;
  } finally {
    // Always undo the monkey-patch so the stub neither leaks into other code
    // nor stacks up across repeated calls, even if parsing throws.
    fs.readFileSync = orig;
  }
}
|
||||
|
||||
// A config containing only a bare seed URL and no scope options is expected to
// resolve to one seed with scopeType "prefix", a single include regex matching
// URLs that start with https://example.com/, and an empty exclude list.
test("default scope", async () => {
|
||||
const seeds = getSeeds(`
|
||||
seeds:
|
||||
- https://example.com/
|
||||
|
||||
`);
|
||||
|
||||
|
||||
expect(seeds.length).toEqual(1);
|
||||
expect(seeds[0].scopeType).toEqual("prefix");
|
||||
expect(seeds[0].include).toEqual([/^https:\/\/example\.com\//]);
|
||||
expect(seeds[0].exclude).toEqual([]);
|
||||
|
||||
});
|
||||
|
||||
// When the config supplies include and exclude regexes and no scopeType, the
// seed is expected to resolve to scopeType "custom" with exactly those regexes
// as its include and exclude lists.
test("custom scope", async () => {
|
||||
const seeds = getSeeds(`
|
||||
seeds:
|
||||
- url: https://example.com/
|
||||
include: https://example.com/(path|other)
|
||||
exclude: https://example.com/pathexclude
|
||||
`);
|
||||
|
||||
|
||||
expect(seeds.length).toEqual(1);
|
||||
expect(seeds[0].scopeType).toEqual("custom");
|
||||
expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
|
||||
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
|
||||
});
|
||||
|
||||
|
||||
// Two seeds that specify only a url, plus one include/exclude pair listed after
// the seeds: both seeds are expected to resolve to scopeType "custom" and to
// carry the same include and exclude regexes.
test("inherit scope", async () => {
|
||||
const seeds = getSeeds(`
|
||||
|
||||
seeds:
|
||||
- url: https://example.com/1
|
||||
- url: https://example.com/2
|
||||
|
||||
include: https://example.com/(path|other)
|
||||
exclude: https://example.com/pathexclude
|
||||
`);
|
||||
|
||||
|
||||
expect(seeds.length).toEqual(2);
|
||||
|
||||
expect(seeds[0].scopeType).toEqual("custom");
|
||||
expect(seeds[0].url).toEqual("https://example.com/1");
|
||||
expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
|
||||
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
|
||||
|
||||
expect(seeds[1].scopeType).toEqual("custom");
|
||||
expect(seeds[1].url).toEqual("https://example.com/2");
|
||||
expect(seeds[1].include).toEqual([/https:\/\/example.com\/(path|other)/]);
|
||||
expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
|
||||
|
||||
});
|
||||
|
||||
|
||||
// Three seeds mixing their own settings with a shared include regex:
//   - seed 1 has its own include and is expected to keep it (scopeType "custom");
//   - seed 2 is a bare URL and is expected to pick up the "onlythispath"
//     include (scopeType "custom");
//   - seed 3 sets scopeType "prefix" and is expected to get the prefix regex
//     for its directory instead of the shared include.
// All three are expected to have an empty exclude list.
test("override scope", async () => {
|
||||
const seeds = getSeeds(`
|
||||
|
||||
seeds:
|
||||
- url: https://example.com/1
|
||||
include: https://example.com/(path|other)
|
||||
|
||||
- https://example.com/2
|
||||
|
||||
- url: https://example.com/subpath/file.html
|
||||
scopeType: prefix
|
||||
|
||||
include: https://example.com/onlythispath
|
||||
`);
|
||||
|
||||
expect(seeds.length).toEqual(3);
|
||||
|
||||
expect(seeds[0].scopeType).toEqual("custom");
|
||||
expect(seeds[0].url).toEqual("https://example.com/1");
|
||||
expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
|
||||
expect(seeds[0].exclude).toEqual([]);
|
||||
|
||||
expect(seeds[1].scopeType).toEqual("custom");
|
||||
expect(seeds[1].url).toEqual("https://example.com/2");
|
||||
expect(seeds[1].include).toEqual([/https:\/\/example.com\/onlythispath/]);
|
||||
expect(seeds[1].exclude).toEqual([]);
|
||||
|
||||
expect(seeds[2].scopeType).toEqual("prefix");
|
||||
expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
|
||||
expect(seeds[2].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
|
||||
expect(seeds[2].exclude).toEqual([]);
|
||||
|
||||
});
|
||||
|
||||
|
||||
// Five seeds with explicit scopeTypes (page, prefix, any, none, none) plus a
// shared list of two exclude regexes. Each seed's include list is expected to
// follow its scopeType (empty for "none"). The first four seeds are expected to
// carry both shared exclude regexes; the fifth sets `exclude: ''` and is
// expected to end up with an empty exclude list.
test("override scope with exclude", async () => {
|
||||
const seeds = getSeeds(`
|
||||
|
||||
seeds:
|
||||
- url: https://example.com/1
|
||||
scopeType: page
|
||||
|
||||
- url: https://example.com/subpath/file.html
|
||||
scopeType: prefix
|
||||
|
||||
- url: https://example.com/2
|
||||
scopeType: any
|
||||
|
||||
- url: https://example.com/3
|
||||
scopeType: none
|
||||
|
||||
- url: https://example.com/4
|
||||
scopeType: none
|
||||
exclude: ''
|
||||
|
||||
exclude:
|
||||
- /search\\?
|
||||
- q\\?
|
||||
|
||||
`);
|
||||
|
||||
expect(seeds.length).toEqual(5);
|
||||
const excludeRxs = [/\/search\?/, /q\?/];
|
||||
|
||||
expect(seeds[0].scopeType).toEqual("page");
|
||||
expect(seeds[0].url).toEqual("https://example.com/1");
|
||||
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\/1#.+/]);
|
||||
expect(seeds[0].exclude).toEqual(excludeRxs);
|
||||
|
||||
expect(seeds[1].scopeType).toEqual("prefix");
|
||||
expect(seeds[1].url).toEqual("https://example.com/subpath/file.html");
|
||||
expect(seeds[1].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
|
||||
expect(seeds[1].exclude).toEqual(excludeRxs);
|
||||
|
||||
expect(seeds[2].scopeType).toEqual("any");
|
||||
expect(seeds[2].url).toEqual("https://example.com/2");
|
||||
expect(seeds[2].include).toEqual([/.*/]);
|
||||
expect(seeds[2].exclude).toEqual(excludeRxs);
|
||||
|
||||
expect(seeds[3].scopeType).toEqual("none");
|
||||
expect(seeds[3].url).toEqual("https://example.com/3");
|
||||
expect(seeds[3].include).toEqual([]);
|
||||
expect(seeds[3].exclude).toEqual(excludeRxs);
|
||||
|
||||
expect(seeds[4].scopeType).toEqual("none");
|
||||
expect(seeds[4].url).toEqual("https://example.com/4");
|
||||
expect(seeds[4].include).toEqual([]);
|
||||
expect(seeds[4].exclude).toEqual([]);
|
||||
|
||||
});
|
||||
|
|
@ -71,17 +71,18 @@ class ArgParser {
|
|||
type: "number",
|
||||
},
|
||||
|
||||
"scope": {
|
||||
"scopeType": {
|
||||
describe: "Predefined for which URLs to crawl, can be: prefix, page, host, any, or custom, to use the scopeIncludeRx/scopeExcludeRx",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"scopeIncludeRx": {
|
||||
alias: "include",
|
||||
describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
|
||||
},
|
||||
|
||||
"scopeType": {
|
||||
describe: "Simplified scope for which URLs to crawl, can be: prefix, page, host, any",
|
||||
type: "string",
|
||||
default: "prefix",
|
||||
},
|
||||
|
||||
"exclude": {
|
||||
"scopeExcludeRx": {
|
||||
alias: "exclude",
|
||||
describe: "Regex of page URLs that should be excluded from the crawl."
|
||||
},
|
||||
|
||||
|
@ -169,6 +170,7 @@ class ArgParser {
|
|||
},
|
||||
|
||||
"useSitemap": {
|
||||
alias: "sitemap",
|
||||
describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
|
||||
},
|
||||
|
||||
|
@ -300,18 +302,21 @@ class ArgParser {
|
|||
}
|
||||
}
|
||||
|
||||
if (argv.include || argv.exclude) {
|
||||
if (argv.scopeType && argv.scopeType !== "custom") {
|
||||
console.warn("You've specified a --scopeType and a --scopeIncludeRx or --scopeExcludeRx regex. The custom scope regex will take precedence, overriding the scopeType");
|
||||
argv.scopeType = "custom";
|
||||
}
|
||||
}
|
||||
|
||||
const scopeOpts = {
|
||||
type: argv.scopeType,
|
||||
sitemap: argv.useSitemap,
|
||||
include: argv.scope,
|
||||
scopeType: argv.scopeType,
|
||||
sitemap: argv.sitemap,
|
||||
include: argv.include,
|
||||
exclude: argv.exclude,
|
||||
depth: argv.depth,
|
||||
};
|
||||
|
||||
if (argv.scope && argv.scopeType) {
|
||||
console.warn("You've specified a --scopeType and a --scope regex. The custom scope regex will take precedence, overriding the scopeType");
|
||||
}
|
||||
|
||||
argv.scopedSeeds = [];
|
||||
|
||||
for (let seed of argv.seeds) {
|
||||
|
|
|
@ -2,10 +2,11 @@ const ws = require("ws");
|
|||
const http = require("http");
|
||||
const url = require("url");
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
|
||||
const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
|
||||
|
||||
const indexHTML = fs.readFileSync("/app/screencast/index.html", {encoding: "utf8"});
|
||||
const indexHTML = fs.readFileSync(path.join(__dirname, "..", "screencast", "index.html"), {encoding: "utf8"});
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
|
|
|
@ -1,14 +1,20 @@
|
|||
class ScopedSeed
|
||||
{
|
||||
constructor({url, type, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
|
||||
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
|
||||
const parsedUrl = this.parseUrl(url);
|
||||
this.url = parsedUrl.href;
|
||||
this.type = type;
|
||||
if (type) {
|
||||
[include, allowHash] = this.scopeFromType(type, parsedUrl);
|
||||
}
|
||||
this.include = this.parseRx(include);
|
||||
this.exclude = this.parseRx(exclude);
|
||||
|
||||
if (!scopeType) {
|
||||
scopeType = (this.include.length || this.exclude.length) ? "custom" : "prefix";
|
||||
}
|
||||
|
||||
this.scopeType = scopeType;
|
||||
|
||||
if (this.scopeType !== "custom") {
|
||||
[this.include, allowHash] = this.scopeFromType(this.scopeType, parsedUrl);
|
||||
}
|
||||
this.sitemap = this.resolveSiteMap(sitemap);
|
||||
this.allowHash = allowHash;
|
||||
this.maxDepth = depth < 0 ? 99999 : depth;
|
||||
|
@ -44,11 +50,11 @@ class ScopedSeed
|
|||
return sitemap;
|
||||
}
|
||||
|
||||
scopeFromType(type, parsedUrl) {
|
||||
scopeFromType(scopeType, parsedUrl) {
|
||||
let include;
|
||||
let allowHash = false;
|
||||
|
||||
switch (type) {
|
||||
switch (scopeType) {
|
||||
case "page":
|
||||
// allow scheme-agnostic URLS as likely redirects
|
||||
include = [new RegExp("^" + rxEscape(parsedUrl.href).replace(parsedUrl.protocol, "https?:") + "#.+")];
|
||||
|
@ -72,7 +78,7 @@ class ScopedSeed
|
|||
break;
|
||||
|
||||
default:
|
||||
throw new Error(`Invalid scope type "${type}" specified, valid types are: page, prefix, host`);
|
||||
throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, prefix, host`);
|
||||
}
|
||||
|
||||
return [include, allowHash];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue