mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00

This adds Prettier to the repo and sets up the pre-commit hook to auto-format as well as lint. It also updates the ignore files to exclude crawls, test-crawls, scratch, and dist as needed.
279 lines
6.3 KiB
TypeScript
279 lines
6.3 KiB
TypeScript
import { logger } from "./logger.js";
|
|
import { MAX_DEPTH } from "./constants.js";
|
|
|
|
/**
 * Names of the scoping strategies a seed can use to decide which discovered
 * URLs are part of the crawl.
 */
type ScopeType =
  // include URLs under the seed's origin + directory portion of its path
  | "prefix"
  // include URLs on the seed's origin
  | "host"
  // include URLs on the seed's host (leading "www." dropped) and its subdomains
  | "domain"
  // no include patterns are generated; crawl depth is limited to the extra hops
  | "page"
  // include the seed URL followed by a "#..." fragment; hashes are kept
  | "page-spa"
  // include every URL
  | "any"
  // use only the caller-supplied include patterns
  | "custom";
|
|
|
|
/**
 * A crawl seed URL together with the include/exclude rules that decide
 * whether a discovered URL belongs to the crawl started from that seed.
 */
export class ScopedSeed {
  /** Normalized (`URL.href`) form of the seed URL. */
  url: string;
  /** Scope strategy in effect; resolved to "custom" or "prefix" if not given. */
  scopeType: ScopeType;
  /** Patterns a URL must match (at least one) to be in scope. */
  include: RegExp[];
  /** Patterns that reject a URL even when it is otherwise accepted. */
  exclude: RegExp[] = [];
  /** When false, the fragment is stripped from URLs before they are tested. */
  allowHash = false;
  // NOTE: `depth` and `extraHops` keep their initial values; the constructor
  // stores the effective limits in `maxDepth` and `maxExtraHops` instead.
  depth = -1;
  /** Absolute sitemap URL, or null when no sitemap was requested. */
  sitemap?: string | null;
  extraHops = 0;

  /** Number of hops allowed outside the include patterns (0 disables). */
  maxExtraHops = 0;
  /** Largest depth accepted by `isIncluded`. */
  maxDepth = 0;

  /**
   * @param url - Seed URL; must parse as an http(s) URL.
   * @param scopeType - Scope strategy. If falsy, "custom" is used when
   *   include patterns were supplied and "prefix" otherwise.
   * @param include - Extra include patterns (regex source strings).
   * @param exclude - Exclusion patterns (regex source strings).
   * @param allowHash - Keep URL fragments; overridden by the scope type for
   *   every scope other than "custom".
   * @param depth - Maximum crawl depth; a negative value means `MAX_DEPTH`.
   * @param sitemap - `true` for `/sitemap.xml` on the seed host, or a URL
   *   (resolved relative to the seed).
   * @param extraHops - Number of hops allowed beyond the include patterns.
   * @throws Error("Invalid URL") when `url` is not a valid http(s) URL.
   */
  constructor({
    url,
    scopeType,
    include,
    exclude = [],
    allowHash = false,
    depth = -1,
    sitemap = false,
    extraHops = 0,
  }: {
    url: string;
    scopeType: ScopeType;
    include: string[];
    exclude?: string[];
    allowHash?: boolean;
    depth?: number;
    sitemap?: string | boolean | null;
    extraHops?: number;
  }) {
    const parsedUrl = this.parseUrl(url);
    if (!parsedUrl) {
      throw new Error("Invalid URL");
    }
    this.url = parsedUrl.href;
    this.include = this.parseRx(include);
    this.exclude = this.parseRx(exclude);
    this.scopeType = scopeType;

    if (!this.scopeType) {
      this.scopeType = this.include.length ? "custom" : "prefix";
    }

    if (this.scopeType !== "custom") {
      const [includeNew, allowHashNew] = this.scopeFromType(
        this.scopeType,
        parsedUrl,
      );
      // scope-derived patterns are tested before the user-supplied ones
      this.include = [...includeNew, ...this.include];
      allowHash = allowHashNew;
    }

    // for page scope, the depth is set to extraHops, as no other
    // crawling is done
    if (this.scopeType === "page") {
      depth = extraHops;
    }

    this.sitemap = this.resolveSiteMap(sitemap);
    this.allowHash = allowHash;
    this.maxExtraHops = extraHops;
    this.maxDepth = depth < 0 ? MAX_DEPTH : depth;
  }

  /**
   * Normalizes a pattern specification into a list of RegExp objects.
   *
   * @param value - A single pattern, a list of patterns, or nothing.
   * @returns An empty list for null/undefined/"", otherwise one RegExp per
   *   pattern. RegExp elements of a list are reused as-is.
   * @throws SyntaxError when a string is not a valid regular expression.
   */
  parseRx(
    value: string | RegExp | (string | RegExp)[] | null | undefined,
  ): RegExp[] {
    if (value === null || value === undefined || value === "") {
      return [];
    }
    if (Array.isArray(value)) {
      return value.map((e) => (e instanceof RegExp ? e : new RegExp(e)));
    }
    return [new RegExp(value)];
  }

  /**
   * Appends an exclusion pattern. Falsy values are ignored.
   *
   * @param value - Regex source string or RegExp to exclude.
   */
  addExclusion(value: string | RegExp) {
    if (!value) {
      return;
    }
    if (!(value instanceof RegExp)) {
      value = new RegExp(value);
    }
    this.exclude.push(value);
  }

  /**
   * Removes the first exclusion matching `value`.
   *
   * `value` may be the textual form of a RegExp ("/foo/"), a RegExp, or the
   * same regex source string that was given to `addExclusion` ("foo").
   *
   * @returns true if an exclusion was removed, false otherwise.
   */
  removeExclusion(value: string | RegExp): boolean {
    if (!value) {
      return false;
    }

    // First try the exact textual form, e.g. "/foo/".
    const literal = value.toString();
    let index = this.exclude.findIndex((rx) => rx.toString() === literal);

    // Otherwise treat a string as regex source, mirroring addExclusion(),
    // so that addExclusion("foo") can be undone with removeExclusion("foo").
    if (index < 0 && typeof value === "string") {
      let compiled: string | null = null;
      try {
        compiled = new RegExp(value).toString();
      } catch (e) {
        // not a valid pattern, so it cannot have been added
        compiled = null;
      }
      if (compiled !== null) {
        index = this.exclude.findIndex((rx) => rx.toString() === compiled);
      }
    }

    if (index < 0) {
      return false;
    }
    this.exclude.splice(index, 1);
    return true;
  }

  /**
   * Parses `url` (after trimming) and accepts only http: and https: URLs.
   *
   * @param url - Candidate URL string.
   * @param logDetails - Extra fields attached to the warning log entry.
   * @returns The parsed URL, or null (after logging a warning) if the string
   *   is not a URL or uses another protocol.
   */
  parseUrl(url: string, logDetails = {}): URL | null {
    let parsedUrl: URL;
    try {
      parsedUrl = new URL(url.trim());
    } catch (e) {
      logger.warn("Invalid Page - not a valid URL", { url, ...logDetails });
      return null;
    }

    if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
      logger.warn("Invalid Page - URL must start with http:// or https://", {
        url,
        ...logDetails,
      });
      return null;
    }

    return parsedUrl;
  }

  /**
   * Turns the sitemap option into an absolute URL.
   *
   * @param sitemap - `true` selects `/sitemap.xml` on the seed URL; a string
   *   is resolved relative to the seed URL; anything else yields null.
   */
  resolveSiteMap(sitemap: boolean | string | null): string | null {
    if (sitemap === true) {
      const url = new URL(this.url);
      url.pathname = "/sitemap.xml";
      return url.href;
    } else if (typeof sitemap === "string") {
      const url = new URL(sitemap, this.url);
      return url.href;
    }

    return null;
  }

  /**
   * Builds the include patterns implied by a scope type.
   *
   * `parsedUrl` is not modified.
   *
   * @param scopeType - Any scope type except "custom".
   * @param parsedUrl - Parsed seed URL.
   * @returns A tuple of the include patterns and whether URL fragments
   *   should be kept.
   */
  scopeFromType(scopeType: ScopeType, parsedUrl: URL): [RegExp[], boolean] {
    let include: RegExp[] = [];
    let allowHash = false;

    switch (scopeType) {
      case "page":
        include = [];
        break;

      case "page-spa":
        // allow scheme-agnostic URLS as likely redirects
        include = [
          new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+"),
        ];
        allowHash = true;
        break;

      case "prefix":
        include = [
          new RegExp(
            "^" +
              urlRxEscape(
                parsedUrl.origin +
                  parsedUrl.pathname.slice(
                    0,
                    parsedUrl.pathname.lastIndexOf("/") + 1,
                  ),
                parsedUrl,
              ),
          ),
        ];
        break;

      case "host":
        include = [
          new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl)),
        ];
        break;

      case "domain": {
        // Work on a copy so the caller's URL object keeps its hostname.
        const domainUrl = new URL(parsedUrl.href);
        if (domainUrl.hostname.startsWith("www.")) {
          domainUrl.hostname = domainUrl.hostname.slice("www.".length);
        }
        include = [
          new RegExp(
            "^" +
              // allow any chain of subdomains in front of the bare host
              urlRxEscape(domainUrl.origin + "/", domainUrl).replace(
                "\\/\\/",
                "\\/\\/([^/]+\\.)*",
              ),
          ),
        ];
        break;
      }

      case "any":
        include = [/.*/];
        break;

      default:
        logger.fatal(
          `Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`,
        );
    }

    return [include, allowHash];
  }

  /** Reports whether `depth` has reached the configured maximum depth. */
  isAtMaxDepth(depth: number) {
    return depth >= this.maxDepth;
  }

  /**
   * Decides whether `url`, found at `depth`, belongs to this seed's crawl.
   *
   * @param url - Candidate URL.
   * @param depth - Depth at which the URL was found.
   * @param extraHops - Hops already taken outside the include patterns.
   * @param logDetails - Extra fields for warning logs on invalid URLs.
   * @returns
   *   - `false` if the URL is too deep, invalid, out of scope, or excluded;
   *   - `true` if the (normalized) URL is the seed URL itself — this check
   *     happens before exclusions are applied;
   *   - otherwise `{ url, isOOS }` with the normalized URL and a flag that
   *     is true when the URL was accepted only through the extra-hops
   *     allowance.
   */
  isIncluded(
    url: string,
    depth: number,
    extraHops = 0,
    logDetails = {},
  ): boolean | { url: string; isOOS: boolean } {
    if (depth > this.maxDepth) {
      return false;
    }

    const urlParsed = this.parseUrl(url, logDetails);
    if (!urlParsed) {
      return false;
    }

    if (!this.allowHash) {
      // remove hashtag
      urlParsed.hash = "";
    }

    url = urlParsed.href;

    if (url === this.url) {
      return true;
    }

    // check scopes
    const candidate = url;
    const inScope = this.include.some((rx) => rx.test(candidate));

    let isOOS = false;

    if (!inScope) {
      if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
        isOOS = true;
      } else {
        return false;
      }
    }

    // check exclusions
    if (this.exclude.some((rx) => rx.test(candidate))) {
      return false;
    }

    return { url, isOOS };
  }
}
|
|
|
|
/**
 * Escapes every character of `string` that has a special meaning in a
 * regular expression (including `-` and `/`), so the result can be embedded
 * in a pattern and match the input literally.
 */
export function rxEscape(string: string) {
  const specialChars = /[-/\\^$*+?.()|[\]{}]/g;
  // "$&" re-inserts the matched character after the added backslash
  const escaped = string.replace(specialChars, "\\$&");
  return escaped;
}
|
|
|
|
/**
 * Regex-escapes `url` and makes its scheme match both http and https.
 *
 * @param url - URL text to escape.
 * @param parsedUrl - Parsed URL whose `protocol` ("http:" or "https:") is
 *   the scheme text to be replaced.
 * @returns Escaped pattern source in which the first occurrence of the
 *   protocol is replaced by `https?:`.
 */
export function urlRxEscape(url: string, parsedUrl: URL) {
  const escapedUrl = rxEscape(url);
  const scheme = parsedUrl.protocol;
  return escapedUrl.replace(scheme, "https?:");
}
|