browsertrix-crawler/util/seeds.js
Ilya Kreymer 473de8c49f
Scope Handling Improvements + Tests (#66)
* scope fixes:
- remove default prefix scopeType, ensure scope include and exclude take precedence
- add new 'custom' scopeType, when include or exclude are used
- use --scopeIncludeRx and --scopeExcludeRx for better consistency for scope include and exclude (also allow --include/--exclude)
- ensure per-seed scope include/exclude used when present, and scopeType set to 'custom'
- ensure default scope is set to 'prefix' if no scopeType and no include/exclude regexes specified
- rename --type to --scopeType in seed to maintain consistency
- add sitemap param as alias for useSitemap

tests: 
- add seed scope resolution tests for argParse, testing per-seed scope resolution, inheritance and overrides
- fix screencaster to use relative paths to work with tests
- ci: use yarn instead of npm

* update README with new flags

* bump version to 0.4.0-beta.3
2021-07-06 20:22:27 -07:00

144 lines
3.1 KiB
JavaScript

/**
 * A crawl seed URL together with the rules that decide whether a discovered
 * URL is in scope for that seed.
 *
 * Scope is expressed as a list of `include` regexes (a URL must match at
 * least one) and a list of `exclude` regexes (a URL must match none).
 */
class ScopedSeed
{
  /**
   * @param {object} [opts]
   * @param {string} opts.url - Seed URL; must use http: or https:.
   * @param {string} [opts.scopeType] - "page", "prefix", "host", "any",
   *   "none" or "custom". When omitted it becomes "custom" if any include or
   *   exclude regex was supplied, otherwise "prefix".
   * @param {string|RegExp|Array<string|RegExp>} [opts.include] - Include
   *   patterns. Only used as-is when the scope type is "custom"; every other
   *   scope type replaces them with patterns derived from the seed URL.
   * @param {string|RegExp|Array<string|RegExp>} [opts.exclude] - Exclude patterns.
   * @param {boolean} [opts.allowHash] - Keep the URL fragment when matching.
   *   Only honoured for "custom" scope; other scope types set their own value.
   * @param {number} [opts.depth] - Maximum link depth; negative means
   *   effectively unlimited (stored as 99999).
   * @param {boolean|string} [opts.sitemap] - `true` to use `/sitemap.xml` on
   *   the seed's origin; any other value is stored unchanged.
   * @throws {Error} If the URL is not http(s) or the scope type is unknown.
   * @throws {TypeError} If the URL cannot be parsed.
   */
  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
    const parsedUrl = this.parseUrl(url);
    this.url = parsedUrl.href;
    this.include = this.parseRx(include);
    this.exclude = this.parseRx(exclude);

    if (!scopeType) {
      scopeType = (this.include.length || this.exclude.length) ? "custom" : "prefix";
    }
    this.scopeType = scopeType;

    if (this.scopeType !== "custom") {
      // A named scope type overrides both the include list and allowHash.
      [this.include, allowHash] = this.scopeFromType(this.scopeType, parsedUrl);
    }

    this.sitemap = this.resolveSiteMap(sitemap);
    this.allowHash = allowHash;
    this.maxDepth = depth < 0 ? 99999 : depth;
  }

  /**
   * Normalise an include/exclude option into an array of RegExp objects.
   *
   * @param {string|RegExp|Array<string|RegExp>|null|undefined} value
   * @returns {RegExp[]} Empty for falsy input; strings are compiled with
   *   `new RegExp`, RegExp instances are kept as they are.
   */
  parseRx(value) {
    if (!value) {
      return [];
    }
    if (typeof value === "string") {
      return [new RegExp(value)];
    }
    if (value instanceof RegExp) {
      // A lone RegExp has no .map(); treat it as a one-element list.
      return [value];
    }
    return value.map((entry) => (typeof entry === "string" ? new RegExp(entry) : entry));
  }

  /**
   * Parse a URL and require the http or https scheme.
   *
   * @param {string} url
   * @returns {URL}
   * @throws {TypeError} If `url` is not a valid absolute URL.
   * @throws {Error} If the scheme is neither http: nor https:.
   */
  parseUrl(url) {
    const parsedUrl = new URL(url);
    if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
      throw new Error("URL must start with http:// or https://");
    }
    return parsedUrl;
  }

  /**
   * Resolve the `sitemap` option to the value stored on the seed.
   *
   * @param {boolean|string} sitemap
   * @returns {boolean|string} `/sitemap.xml` on the seed's origin when
   *   `sitemap === true`; otherwise `sitemap` unchanged.
   */
  resolveSiteMap(sitemap) {
    if (sitemap === true) {
      // Resolve relative to the seed so the seed's own query string and
      // fragment are not carried over onto the sitemap URL.
      return new URL("/sitemap.xml", this.url).href;
    }
    return sitemap;
  }

  /**
   * Build the include list and allowHash flag for a named scope type.
   *
   * @param {string} scopeType - "page", "prefix", "host", "any" or "none".
   * @param {URL} parsedUrl - The parsed seed URL.
   * @returns {[RegExp[], boolean]} `[include, allowHash]`.
   * @throws {Error} If `scopeType` is not one of the types above.
   */
  scopeFromType(scopeType, parsedUrl) {
    let include;
    let allowHash = false;

    switch (scopeType) {
    case "page":
      // Only the seed page itself followed by a non-empty fragment.
      // allow scheme-agnostic URLS as likely redirects
      include = [new RegExp("^" + rxEscape(parsedUrl.href).replace(parsedUrl.protocol, "https?:") + "#.+")];
      allowHash = true;
      break;

    case "prefix":
      // Origin plus the seed path up to and including its last "/".
      include = [new RegExp("^" + rxEscape(parsedUrl.origin + parsedUrl.pathname.slice(0, parsedUrl.pathname.lastIndexOf("/") + 1)))];
      break;

    case "host":
      include = [new RegExp("^" + rxEscape(parsedUrl.origin + "/"))];
      break;

    case "any":
      include = [/.*/];
      break;

    case "none":
      include = [];
      break;

    default:
      throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, prefix, host, any, none`);
    }

    return [include, allowHash];
  }

  /**
   * Report whether any regex in the list matches the URL.
   *
   * @param {RegExp[]} rxList
   * @param {string} url
   * @returns {boolean}
   */
  matchesAny(rxList, url) {
    return rxList.some((rx) => {
      // Caller-supplied regexes may carry the g or y flag, which makes
      // matching depend on lastIndex left over from a previous URL.
      rx.lastIndex = 0;
      return rx.test(url);
    });
  }

  /**
   * Decide whether a URL found at the given depth is in scope for this seed.
   *
   * @param {string} url - Candidate URL.
   * @param {number} depth - Link depth at which the URL was found.
   * @returns {string|false} The normalised URL (fragment removed unless
   *   `allowHash` is set) when in scope, otherwise `false`.
   */
  isIncluded(url, depth) {
    if (depth > this.maxDepth) {
      return false;
    }

    let parsed;
    try {
      parsed = this.parseUrl(url);
    } catch (e) {
      // Unparseable or non-http(s) URLs are simply out of scope.
      return false;
    }

    if (!this.allowHash) {
      // remove hashtag
      parsed.hash = "";
    }

    const href = parsed.href;

    if (!this.matchesAny(this.include, href)) {
      return false;
    }

    if (this.matchesAny(this.exclude, href)) {
      return false;
    }

    return href;
  }
}
/**
 * Escape a string so it can be embedded in a RegExp source and match
 * itself literally.
 *
 * @param {string} string - Text to escape.
 * @returns {string} The text with each regex-special character prefixed by a backslash.
 */
function rxEscape(string) {
  const specialChars = /[-/\\^$*+?.()|[\]{}]/g;
  // "$&" inserts the matched character after the literal backslash.
  const backslashed = "\\$&";
  return string.replace(specialChars, backslashed);
}
// CommonJS exports: the seed class and the regex-escaping helper.
Object.assign(module.exports, {ScopedSeed, rxEscape});