browsertrix-crawler/util/seeds.js
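
// A single crawl seed: parses and validates the seed URL, derives include/exclude
// regexes from the configured scope type, and determines whether URLs discovered
// during the crawl are in scope.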
export class ScopedSeed
{
  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
    const parsedUrl = this.parseUrl(url);
    this.url = parsedUrl.href;
    this.include = this.parseRx(include);
    this.exclude = this.parseRx(exclude);
    this.scopeType = scopeType;

    if (!this.scopeType) {
      this.scopeType = this.include.length ? "custom" : "prefix";
    }

    if (this.scopeType !== "custom") {
      [this.include, allowHash] = this.scopeFromType(this.scopeType, parsedUrl);
    }

    this.sitemap = this.resolveSiteMap(sitemap);
    this.allowHash = allowHash;
    this.maxExtraHops = extraHops;
    this.maxDepth = depth < 0 ? 99999 : depth;
  }
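
  // convert a single pattern string or a list of patterns into RegExp objects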
  parseRx(value) {
    if (!value) {
      return [];
    } else if (typeof(value) === "string") {
      return [new RegExp(value)];
    } else {
      return value.map(e => typeof(e) === "string" ? new RegExp(e) : e);
    }
  }
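
  // parse and validate a seed URL; only http: and https: URLs are accepted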
  parseUrl(url) {
    let parsedUrl = null;

    try {
      parsedUrl = new URL(url.trim());
    } catch (e) {
      throw new Error(`Invalid Seed "${url}" - not a valid URL`);
    }

    if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
      throw new Error(`Invalid Seed "${url}" - URL must start with http:// or https://`);
    }

    return parsedUrl;
  }
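
  // if sitemap is `true`, derive a default sitemap URL from the seed
  // (path replaced with /sitemap.xml); otherwise pass the value through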
  resolveSiteMap(sitemap) {
    if (sitemap === true) {
      const url = new URL(this.url);
      url.pathname = "/sitemap.xml";
      return url.href;
    }

    return sitemap;
  }
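
  // map a named scope type to its include regexes and whether URL hashes
  // should be preserved (only for "page-spa")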
  scopeFromType(scopeType, parsedUrl) {
    let include;
    let allowHash = false;

    switch (scopeType) {
    case "page":
      include = [];
      break;

    case "page-spa":
      // allow scheme-agnostic URLs as likely redirects
      include = [new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+")];
      allowHash = true;
      break;

    case "prefix":
      include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + parsedUrl.pathname.slice(0, parsedUrl.pathname.lastIndexOf("/") + 1), parsedUrl))];
      break;

    case "host":
      include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl))];
      break;

    case "domain":
      if (parsedUrl.hostname.startsWith("www.")) {
        parsedUrl.hostname = parsedUrl.hostname.replace("www.", "");
      }
      include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace("\\/\\/", "\\/\\/([^/]+\\.)*"))];
      break;

    case "any":
      include = [/.*/];
      break;

    default:
      throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
    }

    return [include, allowHash];
  }

  isAtMaxDepth(depth) {
    return depth >= this.maxDepth;
  }
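
  // check whether a URL discovered at the given depth is in scope for this seed:
  // returns false if out of scope or excluded, otherwise {url, isOOS}, where isOOS
  // marks a URL outside the include scope that is still allowed via extra hops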
  isIncluded(url, depth, extraHops = 0) {
    if (depth > this.maxDepth) {
      return false;
    }

    try {
      url = this.parseUrl(url);
    } catch(e) {
      return false;
    }

    if (!this.allowHash) {
      // remove hashtag
      url.hash = "";
    }

    url = url.href;

    if (url === this.url) {
      return true;
    }

    // skip already crawled
    // if (this.seenList.has(url)) {
    //  return false;
    //}

    let inScope = false;

    // check scopes
    for (const s of this.include) {
      if (s.exec(url)) {
        inScope = true;
        break;
      }
    }

    let isOOS = false;

    if (!inScope) {
      if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
        isOOS = true;
      } else {
        //console.log(`Not in scope ${url} ${this.include}`);
        return false;
      }
    }

    // check exclusions
    for (const e of this.exclude) {
      if (e.exec(url)) {
        //console.log(`Skipping ${url} excluded by ${e}`);
        return false;
      }
    }

    return {url, isOOS};
  }
}
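
// escape special regex characters so a string can be matched literally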
export function rxEscape(string) {
  return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
}
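
// escape a URL for use in a regex, matching either the http: or https: scheme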
export function urlRxEscape(url, parsedUrl) {
  return rxEscape(url).replace(parsedUrl.protocol, "https?:");
}
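
// Illustrative usage only (not part of the module; URLs below are placeholders):
//
//   const seed = new ScopedSeed({url: "https://example.com/blog/", scopeType: "prefix"});
//   seed.isIncluded("https://example.com/blog/post-1", 1);  // -> {url: "...", isOOS: false}
//   seed.isIncluded("https://other.example.org/page", 1);   // -> false (outside prefix scope)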