Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Synced 2025-10-19 14:33:17 +00:00
Scope Handling Improvements + Tests (#66)
* scope fixes:
  - remove the default 'prefix' scopeType; ensure scope include and exclude take precedence
  - add new 'custom' scopeType, used when include or exclude regexes are specified
  - use --scopeIncludeRx and --scopeExcludeRx for better consistency of scope include and exclude (--include/--exclude still accepted as aliases)
  - ensure per-seed scope include/exclude are used when present, with scopeType set to 'custom'
  - default the scope to 'prefix' if no scopeType and no include/exclude regexes are specified
  - rename --type to --scopeType in seeds to maintain consistency
  - add sitemap param as an alias for useSitemap

* tests:
  - add seed scope resolution tests for argParser, covering per-seed scope resolution, inheritance and overrides
  - fix screencaster to use relative paths so it works under tests

* ci: use yarn instead of npm

* update README with new flags

* bump version to 0.4.0-beta.3
Parent: ef7d5e50d8
Commit: 473de8c49f
9 changed files with 216 additions and 36 deletions
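As an orientation aid (not part of the commit itself), the resolution rules described above can be illustrated with a config in the YAML shape exercised by the new tests/scopes.test.js; all URLs and paths here are placeholders:

// Illustrative config in the shape used by tests/scopes.test.js.
// All URLs/paths are placeholders.
const config = `
seeds:
  # no scopeType, no include/exclude: scope defaults to 'prefix'
  - https://example.com/

  # per-seed include present: scopeType resolves to 'custom'
  - url: https://example.com/docs/
    include: https://example.com/(docs|assets)

  # an explicit scopeType overrides inherited include/exclude regexes
  - url: https://example.com/page.html
    scopeType: page
`;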
4 .github/workflows/ci.yaml (vendored)

@@ -18,7 +18,7 @@ jobs:
       with:
         node-version: ${{ matrix.node-version }}
     - name: install requirements
-      run: npm install
+      run: yarn install
     - name: run linter
       run: yarn run eslint .
 
@@ -37,7 +37,7 @@ jobs:
       with:
         node-version: ${{ matrix.node-version }}
     - name: install requirements
-      run: npm install
+      run: yarn install
     - name: build docker
       run: docker-compose build
     - name: run crawl
16 README.md

@@ -73,13 +73,15 @@ Options:
                                       [number] [default: 0]
   --timeout                           Timeout for each page to load (in
                                       seconds) [number] [default: 90]
-  --scope                             Regex of page URLs that should be
+  --scopeType                         Predefined for which URLs to crawl,
+                                      can be: prefix, page, host, any, or
+                                      custom, to use the
+                                      scopeIncludeRx/scopeExcludeRx
+                                      [string]
+  --scopeIncludeRx, --include         Regex of page URLs that should be
                                       included in the crawl (defaults to
                                       the immediate directory of URL)
-  --scopeType                         Simplified scope for which URLs to
-                                      crawl, can be: prefix, page, host,
-                                      any [string] [default: "prefix"]
-  --exclude                           Regex of page URLs that should be
+  --scopeExcludeRx, --exclude         Regex of page URLs that should be
                                       excluded from the crawl.
   --allowHashUrls                     Allow Hashtag URLs, useful for
                                       single-page-application crawling or
@@ -88,7 +90,7 @@ Options:
   -c, --collection                    Collection name to crawl to (replay
                                       will be accessible under this name
                                       in pywb preview)
-                  [string] [default: "capture-2021-06-26T19-38-10"]
+                  [string] [default: "capture-2021-07-01T23-43-26"]
   --headless                          Run in headless mode, otherwise
                                       start xvfb[boolean] [default: false]
   --driver                            JS driver for the crawler
@@ -122,7 +124,7 @@ Options:
   --userAgentSuffix                   Append suffix to existing browser
                                       user-agent (ex: +MyCrawler,
                                       info@example.com) [string]
-  --useSitemap                        If enabled, check for sitemaps at
+  --useSitemap, --sitemap             If enabled, check for sitemaps at
                                       /sitemap.xml, or custom URL if URL
                                       is specified
   --statsFilename                     If set, output stats as JSON to this
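As a usage sketch of the renamed flags, here is an invocation in the style of the repo's own tests; the flag names and image tag come from this commit, while the site and regexes are placeholders, and the --url seed option is assumed from the rest of the README. Note that passing include/exclude regexes now implies the 'custom' scope, so no --scopeType is needed:

// Hypothetical invocation exercising the renamed flags; everything other
// than the flag names and image tag is illustrative.
const child_process = require("child_process");

const out = child_process.execSync(
  "docker run webrecorder/browsertrix-crawler:0.4.0-beta.3 crawl " +
  "--url https://example.com/ " +
  "--scopeIncludeRx 'https://example.com/(blog|docs)' " +
  "--scopeExcludeRx '/search\\?'",
  {encoding: "utf8"});

console.log(out);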
docker-compose.yml

@@ -2,7 +2,7 @@ version: '3.5'
 
 services:
   crawler:
-    image: webrecorder/browsertrix-crawler:0.4.0-beta.2
+    image: webrecorder/browsertrix-crawler:0.4.0-beta.3
     build:
       context: ./
 
package.json

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.4.0-beta.2",
+  "version": "0.4.0-beta.3",
   "main": "browsertrix-crawler",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
@@ -10,7 +10,7 @@ test("pass config file via stdin", async () => {
 
   try {
     const version = require("../package.json").version;
-    const proc = child_process.execSync(`docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin --exclude webrecorder.net/202`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
+    const proc = child_process.execSync(`docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin --scopeExcludeRx webrecorder.net/202`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
 
     console.log(proc);
   }
166 tests/scopes.test.js (new file)

const { parseArgs } = require("../util/argParser");

const fs = require("fs");

function getSeeds(config) {
  const orig = fs.readFileSync;

  fs.readFileSync = (name, ...args) => {
    if (name.endsWith("/configtest")) {
      return config;
    }
    return orig(name, ...args);
  };

  return parseArgs(null, ["node", "crawler", "--config", "configtest"]).scopedSeeds;
}

test("default scope", async () => {
  const seeds = getSeeds(`
seeds:
  - https://example.com/

`);

  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([]);
});

test("custom scope", async () => {
  const seeds = getSeeds(`
seeds:
  - url: https://example.com/
    include: https://example.com/(path|other)
    exclude: https://example.com/pathexclude
`);

  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("custom");
  expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});

test("inherit scope", async () => {
  const seeds = getSeeds(`
seeds:
  - url: https://example.com/1
  - url: https://example.com/2

include: https://example.com/(path|other)
exclude: https://example.com/pathexclude
`);

  expect(seeds.length).toEqual(2);

  expect(seeds[0].scopeType).toEqual("custom");
  expect(seeds[0].url).toEqual("https://example.com/1");
  expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);

  expect(seeds[1].scopeType).toEqual("custom");
  expect(seeds[1].url).toEqual("https://example.com/2");
  expect(seeds[1].include).toEqual([/https:\/\/example.com\/(path|other)/]);
  expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});

test("override scope", async () => {
  const seeds = getSeeds(`
seeds:
  - url: https://example.com/1
    include: https://example.com/(path|other)

  - https://example.com/2

  - url: https://example.com/subpath/file.html
    scopeType: prefix

include: https://example.com/onlythispath
`);

  expect(seeds.length).toEqual(3);

  expect(seeds[0].scopeType).toEqual("custom");
  expect(seeds[0].url).toEqual("https://example.com/1");
  expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
  expect(seeds[0].exclude).toEqual([]);

  expect(seeds[1].scopeType).toEqual("custom");
  expect(seeds[1].url).toEqual("https://example.com/2");
  expect(seeds[1].include).toEqual([/https:\/\/example.com\/onlythispath/]);
  expect(seeds[1].exclude).toEqual([]);

  expect(seeds[2].scopeType).toEqual("prefix");
  expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
  expect(seeds[2].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
  expect(seeds[2].exclude).toEqual([]);
});

test("override scope with exclude", async () => {
  const seeds = getSeeds(`
seeds:
  - url: https://example.com/1
    scopeType: page

  - url: https://example.com/subpath/file.html
    scopeType: prefix

  - url: https://example.com/2
    scopeType: any

  - url: https://example.com/3
    scopeType: none

  - url: https://example.com/4
    scopeType: none
    exclude: ''

exclude:
  - /search\\?
  - q\\?

`);

  expect(seeds.length).toEqual(5);
  const excludeRxs = [/\/search\?/, /q\?/];

  expect(seeds[0].scopeType).toEqual("page");
  expect(seeds[0].url).toEqual("https://example.com/1");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\/1#.+/]);
  expect(seeds[0].exclude).toEqual(excludeRxs);

  expect(seeds[1].scopeType).toEqual("prefix");
  expect(seeds[1].url).toEqual("https://example.com/subpath/file.html");
  expect(seeds[1].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
  expect(seeds[1].exclude).toEqual(excludeRxs);

  expect(seeds[2].scopeType).toEqual("any");
  expect(seeds[2].url).toEqual("https://example.com/2");
  expect(seeds[2].include).toEqual([/.*/]);
  expect(seeds[2].exclude).toEqual(excludeRxs);

  expect(seeds[3].scopeType).toEqual("none");
  expect(seeds[3].url).toEqual("https://example.com/3");
  expect(seeds[3].include).toEqual([]);
  expect(seeds[3].exclude).toEqual(excludeRxs);

  expect(seeds[4].scopeType).toEqual("none");
  expect(seeds[4].url).toEqual("https://example.com/4");
  expect(seeds[4].include).toEqual([]);
  expect(seeds[4].exclude).toEqual([]);
});
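Since these are plain Jest-style tests (global test/expect), the new file can presumably be run on its own once dependencies are installed, e.g. yarn jest tests/scopes.test.js (assuming jest is available as a dev dependency; the CI change above moves installs to yarn).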
util/argParser.js

@@ -71,17 +71,18 @@ class ArgParser {
         type: "number",
       },
 
-      "scope": {
+      "scopeType": {
+        describe: "Predefined for which URLs to crawl, can be: prefix, page, host, any, or custom, to use the scopeIncludeRx/scopeExcludeRx",
+        type: "string",
+      },
+
+      "scopeIncludeRx": {
+        alias: "include",
         describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
       },
 
-      "scopeType": {
-        describe: "Simplified scope for which URLs to crawl, can be: prefix, page, host, any",
-        type: "string",
-        default: "prefix",
-      },
-
-      "exclude": {
+      "scopeExcludeRx": {
+        alias: "exclude",
         describe: "Regex of page URLs that should be excluded from the crawl."
       },
 
@@ -169,6 +170,7 @@ class ArgParser {
       },
 
       "useSitemap": {
+        alias: "sitemap",
         describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
       },
 
@@ -300,18 +302,21 @@ class ArgParser {
       }
     }
 
+    if (argv.include || argv.exclude) {
+      if (argv.scopeType && argv.scopeType !== "custom") {
+        console.warn("You've specified a --scopeType and a --scopeIncludeRx or --scopeExcludeRx regex. The custom scope regex will take precedence, overriding the scopeType");
+        argv.scopeType = "custom";
+      }
+    }
+
     const scopeOpts = {
-      type: argv.scopeType,
-      sitemap: argv.useSitemap,
-      include: argv.scope,
+      scopeType: argv.scopeType,
+      sitemap: argv.sitemap,
+      include: argv.include,
       exclude: argv.exclude,
       depth: argv.depth,
     };
 
-    if (argv.scope && argv.scopeType) {
-      console.warn("You've specified a --scopeType and a --scope regex. The custom scope regex will take precedence, overriding the scopeType");
-    }
-
     argv.scopedSeeds = [];
 
     for (let seed of argv.seeds) {
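The net effect of the hunk above, combined with the ScopedSeed default further below, can be restated as a small standalone function (illustrative only, not a function in the codebase):

// Illustrative restatement: include/exclude regexes imply the 'custom' scope,
// overriding a conflicting explicit scopeType with a warning; with no regexes
// and no scopeType, ScopedSeed later defaults to 'prefix'.
function effectiveScopeType(scopeType, include = [], exclude = []) {
  if (include.length || exclude.length) {
    if (scopeType && scopeType !== "custom") {
      console.warn("custom scope regexes take precedence, overriding scopeType");
    }
    return "custom";
  }
  return scopeType || "prefix";
}

// effectiveScopeType(undefined, ["https://example.com/docs"]) === "custom"
// effectiveScopeType("page") === "page"
// effectiveScopeType() === "prefix"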
util/screencaster.js

@@ -2,10 +2,11 @@ const ws = require("ws");
 const http = require("http");
 const url = require("url");
 const fs = require("fs");
+const path = require("path");
 
 const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
 
-const indexHTML = fs.readFileSync("/app/screencast/index.html", {encoding: "utf8"});
+const indexHTML = fs.readFileSync(path.join(__dirname, "..", "screencast", "index.html"), {encoding: "utf8"});
 
 
 // ===========================================================================
util/seeds.js

@@ -1,14 +1,20 @@
 class ScopedSeed
 {
-  constructor({url, type, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
+  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
     const parsedUrl = this.parseUrl(url);
     this.url = parsedUrl.href;
-    this.type = type;
-    if (type) {
-      [include, allowHash] = this.scopeFromType(type, parsedUrl);
-    }
     this.include = this.parseRx(include);
     this.exclude = this.parseRx(exclude);
+
+    if (!scopeType) {
+      scopeType = (this.include.length || this.exclude.length) ? "custom" : "prefix";
+    }
+
+    this.scopeType = scopeType;
+
+    if (this.scopeType !== "custom") {
+      [this.include, allowHash] = this.scopeFromType(this.scopeType, parsedUrl);
+    }
     this.sitemap = this.resolveSiteMap(sitemap);
     this.allowHash = allowHash;
     this.maxDepth = depth < 0 ? 99999 : depth;
@@ -44,11 +50,11 @@ class ScopedSeed
     return sitemap;
   }
 
-  scopeFromType(type, parsedUrl) {
+  scopeFromType(scopeType, parsedUrl) {
     let include;
     let allowHash = false;
 
-    switch (type) {
+    switch (scopeType) {
     case "page":
       // allow scheme-agnostic URLS as likely redirects
       include = [new RegExp("^" + rxEscape(parsedUrl.href).replace(parsedUrl.protocol, "https?:") + "#.+")];
@@ -72,7 +78,7 @@ class ScopedSeed
       break;
 
     default:
-      throw new Error(`Invalid scope type "${type}" specified, valid types are: page, prefix, host`);
+      throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, prefix, host`);
     }
 
     return [include, allowHash];
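For the non-custom branch, scopeFromType regenerates the include regex from the seed URL. A minimal sketch of the 'prefix' case (rxEscape here is a stand-in for the module's own escaping helper, and the real method also handles page, host, and any):

// Sketch of deriving a 'prefix' include regex from a seed URL, in the spirit
// of scopeFromType above. rxEscape is a stand-in, not the module's helper.
function rxEscape(str) {
  return str.replace(/[.*+?^${}()|[\]\\\/]/g, "\\$&");
}

function prefixInclude(url) {
  const href = new URL(url).href;
  // scope to the immediate directory: drop everything after the last '/'
  const prefix = href.slice(0, href.lastIndexOf("/") + 1);
  return [new RegExp("^" + rxEscape(prefix))];
}

// prefixInclude("https://example.com/subpath/file.html")
//   -> [/^https:\/\/example\.com\/subpath\//]  (matches the test expectation above)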