// browsertrix-crawler/util/seeds.js

import { logger } from "./logger.js";
import { MAX_DEPTH } from "./constants.js";
export class ScopedSeed
{
  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
    const parsedUrl = this.parseUrl(url);
    if (!parsedUrl) {
      logger.fatal(`Invalid Seed "${url}" specified, aborting crawl.`);
    }

    this.url = parsedUrl.href;
    this.include = this.parseRx(include);
    this.exclude = this.parseRx(exclude);

    this.scopeType = scopeType;

    if (!this.scopeType) {
      this.scopeType = this.include.length ? "custom" : "prefix";
    }

    if (this.scopeType !== "custom") {
      [include, allowHash] = this.scopeFromType(this.scopeType, parsedUrl);
      this.include = [...include, ...this.include];
    }

    this.sitemap = this.resolveSiteMap(sitemap);
    this.allowHash = allowHash;
    this.maxExtraHops = extraHops;
    this.maxDepth = depth < 0 ? MAX_DEPTH : depth;
  }

  // normalize a string, RegExp, or array of either into an array of RegExps
  parseRx(value) {
    if (value === null || value === undefined || value === "") {
      return [];
    } else if (!(value instanceof Array)) {
      return [new RegExp(value)];
    } else {
      return value.map(e => (e instanceof RegExp) ? e : new RegExp(e));
    }
  }
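
  // Illustrative sketch of how parseRx() normalizes its input; the sample
  // values below are made up and the results are only shown approximately:
  //   parseRx("example\\.com/blog/")     -> [/example\.com\/blog\//]
  //   parseRx([/\.pdf$/, "\\?print=1"])  -> [/\.pdf$/, /\?print=1/]
  //   parseRx(null)                      -> []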

  // parse a URL string, returning null (with a warning) if it is not a valid http(s) URL
  parseUrl(url, logDetails = {}) {
    let parsedUrl = null;
    try {
      parsedUrl = new URL(url.trim());
    } catch (e) {
      logger.warn("Invalid Page - not a valid URL", {url, ...logDetails});
      return null;
    }

    if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
      logger.warn("Invalid Page - URL must start with http:// or https://", {url, ...logDetails});
      parsedUrl = null;
    }

    return parsedUrl;
  }

  // if sitemap is true, default to <origin>/sitemap.xml; otherwise pass the given value through
  resolveSiteMap(sitemap) {
    if (sitemap === true) {
      const url = new URL(this.url);
      url.pathname = "/sitemap.xml";
      return url.href;
    }

    return sitemap;
  }

  // build the default include regexes (and allowHash flag) for a built-in scope type
  scopeFromType(scopeType, parsedUrl) {
    let include;
    let allowHash = false;

    switch (scopeType) {
    case "page":
      include = [];
      break;

    case "page-spa":
      // allow scheme-agnostic URLs as likely redirects
      include = [new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+")];
      allowHash = true;
      break;

    case "prefix":
      include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + parsedUrl.pathname.slice(0, parsedUrl.pathname.lastIndexOf("/") + 1), parsedUrl))];
      break;

    case "host":
      include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl))];
      break;

    case "domain":
      if (parsedUrl.hostname.startsWith("www.")) {
        parsedUrl.hostname = parsedUrl.hostname.replace("www.", "");
      }
      include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace("\\/\\/", "\\/\\/([^/]+\\.)*"))];
      break;

    case "any":
      include = [/.*/];
      break;

    default:
      logger.fatal(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
    }

    return [include, allowHash];
  }
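
  // Illustrative sketch (the sample seed URL is made up): for a seed of
  // "https://www.example.com/blog/post.html", the built-in scope types above
  // roughly produce these include patterns (escaping shown loosely):
  //   prefix   -> ^https?://www\.example\.com/blog/
  //   host     -> ^https?://www\.example\.com/
  //   domain   -> ^https?://([^/]+\.)*example\.com/
  //   page-spa -> ^https?://www\.example\.com/blog/post\.html#.+   (with allowHash = true)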

  isAtMaxDepth(depth) {
    return depth >= this.maxDepth;
  }

  // check whether a URL at the given depth/extraHops is in scope for this seed;
  // returns false if out of scope, true if it is the seed itself, or {url, isOOS} otherwise
  isIncluded(url, depth, extraHops = 0, logDetails = {}) {
    if (depth > this.maxDepth) {
      return false;
    }

    url = this.parseUrl(url, logDetails);
    if (!url) {
      return false;
    }

    if (!this.allowHash) {
      // remove hashtag
      url.hash = "";
    }

    url = url.href;

    if (url === this.url) {
      return true;
    }

    // skip already crawled
    // if (this.seenList.has(url)) {
    //  return false;
    //}

    let inScope = false;

    // check scopes
    for (const s of this.include) {
      if (s.test(url)) {
        inScope = true;
        break;
      }
    }

    let isOOS = false;

    if (!inScope) {
      if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
        isOOS = true;
      } else {
        //console.log(`Not in scope ${url} ${this.include}`);
        return false;
      }
    }

    // check exclusions
    for (const e of this.exclude) {
      if (e.test(url)) {
        //console.log(`Skipping ${url} excluded by ${e}`);
        return false;
      }
    }

    return {url, isOOS};
  }
}
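
// Minimal usage sketch (hypothetical values, not part of this module), based on
// the constructor options and isIncluded() behavior above:
//
//   const seed = new ScopedSeed({
//     url: "https://example.com/docs/",
//     scopeType: "prefix",
//     exclude: ["\\?print=1"],
//     depth: 3,
//     extraHops: 1,
//   });
//
//   seed.isIncluded("https://example.com/docs/page.html", 1);
//   // -> { url: "https://example.com/docs/page.html", isOOS: false }
//   seed.isIncluded("https://other.com/", 1, 1);
//   // -> { url: "https://other.com/", isOOS: true }   (allowed as an extra hop)
//   seed.isIncluded("https://other.com/", 1, 2);
//   // -> false   (past maxExtraHops)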

// escape a string for literal use inside a RegExp
export function rxEscape(string) {
  return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
}

// escape a URL for use in a RegExp, making the scheme match either http or https
export function urlRxEscape(url, parsedUrl) {
  return rxEscape(url).replace(parsedUrl.protocol, "https?:");
}
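
// Illustrative sketch (sample values are made up, results shown as regex-source text):
//   rxEscape("https://example.com/a+b?c")
//     -> "https:\/\/example\.com\/a\+b\?c"
//   urlRxEscape("https://example.com/path", new URL("https://example.com/path"))
//     -> "https?:\/\/example\.com\/path"   (matches both http and https)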