browsertrix-crawler/tests/extra_hops_depth.test.js

import fs from "fs";

import util from "util";
import {exec as execCallback } from "child_process";

const exec = util.promisify(execCallback);

const extraHopsTimeout = 180000;


test("check that URLs are crawled 2 extra hops beyond depth", async () => {
  try {
    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7");
  }
  catch (error) {
    console.log(error);
  }

  const crawledPages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
  const crawledPagesArray = crawledPages.trim().split("\n");

  const expectedPages = [
    "https://webrecorder.net/",
    "https://webrecorder.net/blog",
    "https://webrecorder.net/tools",
    "https://webrecorder.net/community",
    "https://webrecorder.net/about",
    "https://webrecorder.net/contact",
    "https://webrecorder.net/faq",
  ];

  // first line is the header, not page, so adding -1
  expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);

  for (const page of crawledPagesArray) {
    const url = JSON.parse(page).url;
    if (!url) {
      continue;
    }
    expect(expectedPages.indexOf(url) >= 0).toBe(true);
  }
}, extraHopsTimeout);