// browsertrix-crawler/crawler.js

const fs = require("fs");
const puppeteer = require("puppeteer-core");
const { Cluster } = require("puppeteer-cluster");
const child_process = require("child_process");
const fetch = require("node-fetch");
const AbortController = require("abort-controller");
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
const NEW_CONTEXT_OPTS = ["page", "session", "browser"];
const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";
// to ignore HTTPS error for HEAD check
const HTTPS_AGENT = require("https").Agent({
  rejectUnauthorized: false,
});

process.once("SIGINT", () => {
  console.log("SIGINT received, exiting");
  process.exit(1);
});

process.once("SIGTERM", () => {
  console.log("SIGTERM received, exiting");
  process.exit(1);
});
const autoplayScript = fs.readFileSync("./autoplay.js", "utf-8");
// prefix for direct capture via pywb
const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;
const headers = {"User-Agent": CHROME_USER_AGENT};
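
// Launch a Puppeteer cluster behind the pywb recording proxy and crawl
// outward from the seed URL, optionally up to a page limit.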
async function run(params) {
  // Chrome Flags, including proxy server
  const args = [
    "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
    `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
    "--no-sandbox",
    "--disable-background-media-suspend",
    "--autoplay-policy=no-user-gesture-required",
  ];

  // Puppeteer options
  const puppeteerOptions = {
    headless: true,
    executablePath: "/opt/google/chrome/google-chrome",
    ignoreHTTPSErrors: true,
    args
  };

  // params
  const { url, waitUntil, timeout, scope, limit, exclude, scroll, newContext } = params;

  let concurrency = Cluster.CONCURRENCY_PAGE;

  switch (newContext) {
    case "page":
      concurrency = Cluster.CONCURRENCY_PAGE;
      break;

    case "session":
      concurrency = Cluster.CONCURRENCY_CONTEXT;
      break;

    case "browser":
      concurrency = Cluster.CONCURRENCY_BROWSER;
      break;
  }

  // Puppeteer Cluster init and options
  const cluster = await Cluster.launch({
    concurrency,
    maxConcurrency: Number(params.workers) || 1,
    skipDuplicateUrls: true,
    // total timeout for cluster
    timeout: timeout * 2,
    puppeteerOptions,
    puppeteer,
    monitor: true
  });

  // Maintain own seen list
  const seenList = new Set();
  //console.log("Limit: " + limit);

  // links crawled counter
  let numLinks = 0;

  // Crawl Task
  cluster.task(async ({page, data}) => {
    const {url} = data;

    if (!await htmlCheck(url, capturePrefix)) {
      return;
    }

    //page.on('console', message => console.log(`${message.type()} ${message.text()}`));
    //page.on('pageerror', message => console.warn(message));
    //page.on('error', message => console.warn(message));
    //page.on('requestfailed', message => console.warn(message._failureText));

    const mediaResults = [];
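
    // exposed to the injected autoplay script: media URLs it reports are
    // captured directly through pywb rather than loaded in the browser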
    await page.exposeFunction('__crawler_queueUrls', (url) => {
      mediaResults.push(directCapture(url));
    });
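
    // set by the injected autoplay script when a video URL is being loaded,
    // so the crawler waits extra time below for the media to be captured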
    let waitForVideo = false;

    await page.exposeFunction('__crawler_autoplayLoad', (url) => {
      console.log("*** Loading autoplay URL: " + url);
      waitForVideo = true;
    });

    try {
      await page.evaluateOnNewDocument(autoplayScript);
    } catch(e) {
      console.log(e);
    }

    try {
      await page.goto(url, {waitUntil, timeout});
    } catch (e) {
      console.log(`Load timeout for ${url}`);
    }

    try {
      await Promise.all(mediaResults);
    } catch (e) {
      console.log("Error loading media URLs", e);
    }

    if (waitForVideo) {
      console.log("Extra wait 15s for video loading");
      await sleep(15000);
    }

    if (scroll) {
      try {
        await Promise.race([page.evaluate(autoScroll), sleep(30000)]);
      } catch (e) {
        console.warn("Behavior Failed", e);
      }
    }

    let results = null;

    try {
      results = await page.evaluate(() => {
        return [...document.querySelectorAll('a[href]')].map(el => ({ url: el.href }));
      });
    } catch (e) {
      console.warn("Link Extraction failed", e);
      return;
    }

    try {
      for (const data of results) {
        const newUrl = shouldCrawl(scope, seenList, data.url, exclude);

        if (newUrl) {
          seenList.add(newUrl);
          if (numLinks++ >= limit && limit > 0) {
            break;
          }
          cluster.queue({url: newUrl});
        }
      }
    } catch (e) {
      console.log("Queuing Error: " + e);
    }
  });
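
  // queue the seed URL, then wait for the crawl to drain and close the cluster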
  numLinks++;
  cluster.queue({url});

  await cluster.idle();
  await cluster.close();

  // extra wait for all resources to land into WARCs
  console.log("Waiting 30s to ensure WARCs are finished");
  await sleep(30000);
}
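
// Decide whether a discovered link should be queued: normalize the URL,
// accept only http(s), skip already-seen URLs, and apply the scope and
// exclusion regexes. Returns the normalized URL to queue, or false to skip.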
function shouldCrawl(scope, seenList, url, exclude) {
  try {
    url = new URL(url);
  } catch(e) {
    return false;
  }

  // remove hashtag
  url.hash = "";

  // only queue http/https URLs
  if (url.protocol !== "http:" && url.protocol !== "https:") {
    return false;
  }

  url = url.href;

  // skip already crawled
  if (seenList.has(url)) {
    return false;
  }

  let inScope = false;

  // check scopes
  for (const s of scope) {
    if (s.exec(url)) {
      inScope = true;
      break;
    }
  }

  if (!inScope) {
    //console.log(`Not in scope ${url} ${scope}`);
    return false;
  }

  // check exclusions
  for (const e of exclude) {
    if (e.exec(url)) {
      //console.log(`Skipping ${url} excluded by ${e}`);
      return false;
    }
  }

  return url;
}
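
// Issue a HEAD request first to decide whether the URL is an HTML page worth
// loading in the browser. Non-HTML resources are captured directly via pywb;
// on error, fall back to loading the URL in the browser.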
async function htmlCheck(url, capturePrefix) {
  try {
    const agent = url.startsWith("https:") ? HTTPS_AGENT : null;

    const resp = await fetch(url, {method: "HEAD", headers, agent});

    if (resp.status >= 400) {
      console.log(`Skipping ${url}, invalid status ${resp.status}`);
      return false;
    }

    const contentType = resp.headers.get("Content-Type");

    // just load if no content-type
    if (!contentType) {
      return true;
    }

    const mime = contentType.split(";")[0];

    if (HTML_TYPES.includes(mime)) {
      return true;
    }

    // capture directly
    await directCapture(url);

    return false;
  } catch(e) {
    console.log("HTML Check error", e);
    // can't confirm not html, so try in browser
    return true;
  }
}
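
// Capture a non-HTML resource by fetching it through the pywb capture prefix
// so it is recorded into the WARC; the response body is not needed here, so
// the request is aborted as soon as the response starts arriving.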
async function directCapture(url) {
  console.log(`Direct capture: ${capturePrefix}${url}`);
  const abort = new AbortController();
  const signal = abort.signal;
  await fetch(capturePrefix + url, {signal, headers});
  abort.abort();
}
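
// Runs inside the page: scroll down in 250px steps every 500ms until the
// bottom of the document is reached, so lazily loaded content gets fetched.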
async function autoScroll() {
  const canScrollMore = () =>
    self.scrollY + self.innerHeight <
    Math.max(
      self.document.body.scrollHeight,
      self.document.body.offsetHeight,
      self.document.documentElement.clientHeight,
      self.document.documentElement.scrollHeight,
      self.document.documentElement.offsetHeight
    );

  const scrollOpts = { top: 250, left: 0, behavior: 'auto' };

  while (canScrollMore()) {
    self.scrollBy(scrollOpts);
    await new Promise(resolve => setTimeout(resolve, 500));
  }
}
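
// Promise-based delay helper (time in milliseconds)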
function sleep(time) {
  return new Promise(resolve => setTimeout(resolve, time));
}
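
// Parse and validate the command-line options with yargs, then run the crawl.
// Example invocation (illustrative only; the URL and option values are placeholders):
//   node crawler.js --url https://example.com/ --workers 4 --waitUntil networkidle0 --scroll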
async function main() {
  const params = require('yargs')
    .usage("browsertrix-crawler [options]")
    .options({
      "url": {
        alias: "u",
        describe: "The URL to start crawling from",
        demandOption: true,
        type: "string",
      },

      "workers": {
        alias: "w",
        describe: "The number of workers to run in parallel",
        demandOption: false,
        default: 1,
        type: "number",
      },

      "newContext": {
        describe: "The context for each new capture: a new 'page', 'session' or 'browser'",
        default: "page",
        type: "string"
      },

      "waitUntil": {
        describe: "Puppeteer page.goto() condition to wait for before continuing",
        default: "load",
      },

      "limit": {
        describe: "Limit crawl to this number of pages",
        default: 0,
        type: "number",
      },

      "timeout": {
        describe: "Timeout for each page to load (in seconds)",
        default: 90,
        type: "number",
      },

      "scope": {
        describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the seed URL)",
      },

      "exclude": {
        describe: "Regex of page URLs that should be excluded from the crawl",
      },

      "scroll": {
        describe: "If set, will autoscroll to bottom of the page",
        type: "boolean",
        default: false,
      }
    }).check((argv, option) => {
      // Scope for crawl, defaults to the immediate directory of the seed URL
      const url = new URL(argv.url);

      if (url.protocol !== "http:" && url.protocol !== "https:") {
        throw new Error("URL must start with http:// or https://");
      }

      // ensure valid url is used (adds trailing slash if missing)
      argv.url = url.href;

      if (!argv.scope) {
        //argv.scope = url.href.slice(0, url.href.lastIndexOf("/") + 1);
        argv.scope = [new RegExp("^" + rxEscape(url.href.slice(0, url.href.lastIndexOf("/") + 1)))];
      }

      argv.timeout *= 1000;

      // waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
      // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
      if (!WAIT_UNTIL_OPTS.includes(argv.waitUntil)) {
        throw new Error("Invalid waitUntil, must be one of: " + WAIT_UNTIL_OPTS.join(","));
      }

      if (!NEW_CONTEXT_OPTS.includes(argv.newContext)) {
        throw new Error("Invalid newContext, must be one of: " + NEW_CONTEXT_OPTS.join(","));
      }

      // Support one or multiple excludes
      if (argv.exclude) {
        if (typeof(argv.exclude) === "string") {
          argv.exclude = [new RegExp(argv.exclude)];
        } else {
          argv.exclude = argv.exclude.map(e => new RegExp(e));
        }
      } else {
        argv.exclude = [];
      }

      // Support one or multiple scopes
      if (argv.scope) {
        if (typeof(argv.scope) === "string") {
          argv.scope = [new RegExp(argv.scope)];
        } else {
          argv.scope = argv.scope.map(e => new RegExp(e));
        }
      } else {
        argv.scope = [];
      }

      return true;
    })
    .argv;

  console.log("Exclusion Regexes: ", params.exclude);
  console.log("Scope Regexes: ", params.scope);

  try {
    await run(params);
    process.exit(0);
  } catch(e) {
    console.error("Crawl failed");
    console.error(e);
    process.exit(1);
  }
}
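
// Escape regex special characters so a literal URL prefix can be used in a RegExp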
function rxEscape(string) {
  return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
}
main();