browsertrix-crawler/src/util/seeds.ts

import { logger } from "./logger.js";
import { MAX_DEPTH } from "./constants.js";
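// Scope types supported for a seed, as implemented in scopeFromType() below:
//   "page"     - only the seed page itself (depth is limited to extraHops)
//   "page-spa" - the seed page plus its same-page hash (#) URLs
//   "prefix"   - URLs under the seed's path prefix (up to the last "/")
//   "host"     - any URL on the seed's origin
//   "domain"   - the seed's host and any of its subdomains
//   "any"      - any URL
//   "custom"   - only the user-supplied include regexes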
type ScopeType =
| "prefix"
| "host"
| "domain"
| "page"
| "page-spa"
| "any"
| "custom";
export class ScopedSeed {
url: string;
scopeType: ScopeType;
include: RegExp[];
exclude: RegExp[] = [];
allowHash = false;
depth = -1;
sitemap?: string | null;
extraHops = 0;
maxExtraHops = 0;
maxDepth = 0;
constructor({
url,
scopeType,
include,
exclude = [],
allowHash = false,
depth = -1,
sitemap = false,
extraHops = 0,
}: {
url: string;
scopeType: ScopeType;
include: string[];
exclude?: string[];
allowHash?: boolean;
depth?: number;
sitemap?: string | boolean | null;
extraHops?: number;
}) {
const parsedUrl = this.parseUrl(url);
if (!parsedUrl) {
throw new Error("Invalid URL");
}
this.url = parsedUrl.href;
this.include = this.parseRx(include);
this.exclude = this.parseRx(exclude);
this.scopeType = scopeType;
if (!this.scopeType) {
this.scopeType = this.include.length ? "custom" : "prefix";
}
if (this.scopeType !== "custom") {
const [includeNew, allowHashNew] = this.scopeFromType(
this.scopeType,
parsedUrl,
);
this.include = [...includeNew, ...this.include];
allowHash = allowHashNew;
}
// for page scope, the depth is set to extraHops, as no other
// crawling is done
if (this.scopeType === "page") {
depth = extraHops;
}
this.sitemap = this.resolveSiteMap(sitemap);
this.allowHash = allowHash;
this.maxExtraHops = extraHops;
this.maxDepth = depth < 0 ? MAX_DEPTH : depth;
}
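// Illustrative sketch of constructing a seed (assumed values, not taken from an
// actual crawl config):
//
//   const seed = new ScopedSeed({
//     url: "https://example.com/blog/index.html",
//     scopeType: "prefix",
//     include: [],
//     extraHops: 1,
//   });
//   // seed.include matches URLs under https://example.com/blog/
//   // seed.maxDepth === MAX_DEPTH, since depth defaulted to -1
//   // seed.maxExtraHops === 1, allowing one hop outside the prefix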
parseRx(value?: string | RegExp | (string | RegExp)[] | null): RegExp[] {
if (value === null || value === undefined || value === "") {
return [];
} else if (!(value instanceof Array)) {
return [new RegExp(value)];
} else {
return value.map((e) => (e instanceof RegExp ? e : new RegExp(e)));
}
}
addExclusion(value: string | RegExp) {
if (!value) {
return;
}
if (!(value instanceof RegExp)) {
value = new RegExp(value);
}
this.exclude.push(value);
}
removeExclusion(value: string): boolean {
for (let i = 0; i < this.exclude.length; i++) {
if (this.exclude[i].toString() === value.toString()) {
this.exclude.splice(i, 1);
return true;
}
}
return false;
}
parseUrl(url: string, logDetails = {}) {
let parsedUrl = null;
try {
parsedUrl = new URL(url.trim());
} catch (e) {
logger.warn("Invalid Page - not a valid URL", { url, ...logDetails });
return null;
}
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
logger.warn("Invalid Page - URL must start with http:// or https://", {
url,
...logDetails,
});
parsedUrl = null;
}
return parsedUrl;
}
resolveSiteMap(sitemap: boolean | string | null): string | null {
if (sitemap === true) {
const url = new URL(this.url);
url.pathname = "/sitemap.xml";
return url.href;
} else if (typeof sitemap === "string") {
const url = new URL(sitemap, this.url);
return url.href;
}
return null;
}
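// Illustrative resolution, assuming a seed URL of "https://example.com/blog/"
// (assumed values, not from the crawler's tests):
//   resolveSiteMap(true)          -> "https://example.com/sitemap.xml"
//   resolveSiteMap("/custom.xml") -> "https://example.com/custom.xml"
//   resolveSiteMap(null)          -> null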
scopeFromType(scopeType: ScopeType, parsedUrl: URL): [RegExp[], boolean] {
let include: RegExp[] = [];
let allowHash = false;
switch (scopeType) {
case "page":
include = [];
break;
case "page-spa":
// allow scheme-agnostic URLs as likely redirects
include = [
new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+"),
];
allowHash = true;
break;
case "prefix":
include = [
new RegExp(
"^" +
urlRxEscape(
parsedUrl.origin +
parsedUrl.pathname.slice(
0,
parsedUrl.pathname.lastIndexOf("/") + 1,
),
parsedUrl,
),
),
];
break;
case "host":
include = [
new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl)),
];
break;
case "domain":
if (parsedUrl.hostname.startsWith("www.")) {
parsedUrl.hostname = parsedUrl.hostname.replace("www.", "");
}
include = [
new RegExp(
"^" +
urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace(
"\\/\\/",
"\\/\\/([^/]+\\.)*",
),
),
];
break;
case "any":
include = [/.*/];
break;
default:
logger.fatal(
`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`,
);
}
return [include, allowHash];
}
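// Illustrative include patterns, assuming a seed of "https://example.com/docs/page.html":
//   "prefix" -> ^https?:\/\/example\.com\/docs\/
//   "host"   -> ^https?:\/\/example\.com\/
//   "domain" -> ^https?:\/\/([^/]+\.)*example\.com\/   (also matches subdomains)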
isAtMaxDepth(depth: number) {
return depth >= this.maxDepth;
}
isIncluded(url: string, depth: number, extraHops = 0, logDetails = {}) {
if (depth > this.maxDepth) {
return false;
}
const urlParsed = this.parseUrl(url, logDetails);
if (!urlParsed) {
return false;
}
if (!this.allowHash) {
// remove hashtag
urlParsed.hash = "";
}
url = urlParsed.href;
if (url === this.url) {
return true;
}
// skip already crawled
// if (this.seenList.has(url)) {
// return false;
//}
let inScope = false;
// check scopes
for (const s of this.include) {
if (s.test(url)) {
inScope = true;
break;
}
}
let isOOS = false;
if (!inScope) {
if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
isOOS = true;
} else {
//console.log(`Not in scope ${url} ${this.include}`);
return false;
}
}
// check exclusions
for (const e of this.exclude) {
if (e.test(url)) {
//console.log(`Skipping ${url} excluded by ${e}`);
return false;
}
}
return { url, isOOS };
}
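// Illustrative outcomes, assuming a "prefix" seed for "https://example.com/docs/"
// constructed with depth = 3 and extraHops = 1 (assumed values):
//   isIncluded("https://example.com/docs/a.html", 2)   -> { url, isOOS: false }
//   isIncluded("https://example.com/other.html", 2, 1) -> { url, isOOS: true }  (out of scope, extra hop allowed)
//   isIncluded("https://example.com/other.html", 2, 2) -> false  (extra hops exhausted)
//   isIncluded("https://example.com/docs/a.html", 4)   -> false  (beyond maxDepth)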
}
export function rxEscape(string: string) {
return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
}
export function urlRxEscape(url: string, parsedUrl: URL) {
return rxEscape(url).replace(parsedUrl.protocol, "https?:");
}
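// Illustrative example (assumed input): for a seed parsed from "https://example.com/",
//   urlRxEscape("https://example.com/a.b/", parsedUrl)
// returns the regex-escaped source "https?:\/\/example\.com\/a\.b\/",
// with the scheme made optional so both http and https variants match.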