mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00

treat non-regexes as strings and pass to RegExp constructor tests: add additional scope parsing tests for different types passed in as exclusions update yargs bump to 0.10.4
182 lines
4.3 KiB
JavaScript
182 lines
4.3 KiB
JavaScript
import { logger } from "./logger.js";
|
|
import { MAX_DEPTH } from "./constants.js";
|
|
|
|
|
|
export class ScopedSeed
|
|
{
|
|
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
|
|
const parsedUrl = this.parseUrl(url);
|
|
if (!parsedUrl) {
|
|
logger.fatal(`Invalid Seed "${url}" specified, aborting crawl.`);
|
|
}
|
|
this.url = parsedUrl.href;
|
|
this.include = this.parseRx(include);
|
|
this.exclude = this.parseRx(exclude);
|
|
this.scopeType = scopeType;
|
|
|
|
if (!this.scopeType) {
|
|
this.scopeType = this.include.length ? "custom" : "prefix";
|
|
}
|
|
|
|
if (this.scopeType !== "custom") {
|
|
[include, allowHash] = this.scopeFromType(this.scopeType, parsedUrl);
|
|
this.include = [...include, ...this.include];
|
|
}
|
|
|
|
this.sitemap = this.resolveSiteMap(sitemap);
|
|
this.allowHash = allowHash;
|
|
this.maxExtraHops = extraHops;
|
|
this.maxDepth = depth < 0 ? MAX_DEPTH : depth;
|
|
}
|
|
|
|
parseRx(value) {
|
|
if (value === null || value === undefined || value === "") {
|
|
return [];
|
|
} else if (!(value instanceof Array)) {
|
|
return [new RegExp(value)];
|
|
} else {
|
|
return value.map(e => (e instanceof RegExp) ? e : new RegExp(e));
|
|
}
|
|
}
|
|
|
|
parseUrl(url, logDetails = {}) {
|
|
let parsedUrl = null;
|
|
try {
|
|
parsedUrl = new URL(url.trim());
|
|
} catch (e) {
|
|
logger.warn("Invalid Page - not a valid URL", {url, ...logDetails});
|
|
return null;
|
|
}
|
|
|
|
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol != "https:") {
|
|
logger.warn("Invalid Page - URL must start with http:// or https://", {url, ...logDetails});
|
|
parsedUrl = null;
|
|
}
|
|
|
|
return parsedUrl;
|
|
}
|
|
|
|
resolveSiteMap(sitemap) {
|
|
if (sitemap === true) {
|
|
const url = new URL(this.url);
|
|
url.pathname = "/sitemap.xml";
|
|
return url.href;
|
|
}
|
|
|
|
return sitemap;
|
|
}
|
|
|
|
scopeFromType(scopeType, parsedUrl) {
|
|
let include;
|
|
let allowHash = false;
|
|
|
|
switch (scopeType) {
|
|
case "page":
|
|
include = [];
|
|
break;
|
|
|
|
case "page-spa":
|
|
// allow scheme-agnostic URLS as likely redirects
|
|
include = [new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+")];
|
|
allowHash = true;
|
|
break;
|
|
|
|
case "prefix":
|
|
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + parsedUrl.pathname.slice(0, parsedUrl.pathname.lastIndexOf("/") + 1), parsedUrl))];
|
|
break;
|
|
|
|
case "host":
|
|
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl))];
|
|
break;
|
|
|
|
case "domain":
|
|
if (parsedUrl.hostname.startsWith("www.")) {
|
|
parsedUrl.hostname = parsedUrl.hostname.replace("www.", "");
|
|
}
|
|
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace("\\/\\/", "\\/\\/([^/]+\\.)*"))];
|
|
break;
|
|
|
|
case "any":
|
|
include = [/.*/];
|
|
break;
|
|
|
|
default:
|
|
logger.fatal(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
|
|
}
|
|
|
|
return [include, allowHash];
|
|
}
|
|
|
|
isAtMaxDepth(depth) {
|
|
return depth >= this.maxDepth;
|
|
}
|
|
|
|
isIncluded(url, depth, extraHops = 0, logDetails = {}) {
|
|
if (depth > this.maxDepth) {
|
|
return false;
|
|
}
|
|
|
|
url = this.parseUrl(url, logDetails);
|
|
if (!url) {
|
|
return false;
|
|
}
|
|
|
|
if (!this.allowHash) {
|
|
// remove hashtag
|
|
url.hash = "";
|
|
}
|
|
|
|
url = url.href;
|
|
|
|
if (url === this.url) {
|
|
return true;
|
|
}
|
|
|
|
// skip already crawled
|
|
// if (this.seenList.has(url)) {
|
|
// return false;
|
|
//}
|
|
let inScope = false;
|
|
|
|
// check scopes
|
|
for (const s of this.include) {
|
|
if (s.test(url)) {
|
|
inScope = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
let isOOS = false;
|
|
|
|
if (!inScope) {
|
|
if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
|
|
isOOS = true;
|
|
} else {
|
|
//console.log(`Not in scope ${url} ${this.include}`);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// check exclusions
|
|
for (const e of this.exclude) {
|
|
if (e.test(url)) {
|
|
//console.log(`Skipping ${url} excluded by ${e}`);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return {url, isOOS};
|
|
}
|
|
}
|
|
|
|
export function rxEscape(string) {
|
|
return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
|
|
}
|
|
|
|
export function urlRxEscape(url, parsedUrl) {
|
|
return rxEscape(url).replace(parsedUrl.protocol, "https?:");
|
|
}
|
|
|
|
|
|
|
|
|