Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 06:23:16 +00:00)

Dependency Updates:
- Bump Brave to 1.67.123
- Update puppeteer-core to latest; fixes a possible crash when loading the current browser with old profiles
- Tests: simplify the extra hops test to avoid complex pages that could lead to timeouts
65 lines | 1.9 KiB | JavaScript

import fs from "fs";
import util from "util";
import { exec as execCallback } from "child_process";

const exec = util.promisify(execCallback);

// Generous timeout (3 minutes) since the test runs a full crawl in Docker
const extraHopsTimeout = 180000;
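
// The test below runs browsertrix-crawler in Docker with --extraHops 2 and
// then checks which URLs landed in pages.jsonl vs. extraPages.jsonl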
test(
  "check that URLs are crawled 2 extra hops beyond depth",
  async () => {
    try {
      // Crawl https://webrecorder.net/ with 2 extra hops, capped at 5 pages,
      // excluding URLs matching "community" and "tools"
      await exec(
        "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 5 --timeout 10 --exclude community --exclude tools",
      );
    } catch (error) {
      // Log but don't rethrow; the assertions below validate the crawl output
      console.log(error);
    }
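
    // With --extraHops set, pages reached beyond the crawl scope are written
    // to extraPages.jsonl, while in-scope pages go to pages.jsonl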

    const crawledPages = fs.readFileSync(
      "test-crawls/collections/extra-hops-beyond/pages/pages.jsonl",
      "utf8",
    );
    const crawledPagesArray = crawledPages.trim().split("\n");

    const crawledExtraPages = fs.readFileSync(
      "test-crawls/collections/extra-hops-beyond/pages/extraPages.jsonl",
      "utf8",
    );
    const crawledExtraPagesArray = crawledExtraPages.trim().split("\n");
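
    // Each JSONL file starts with a header line, followed by one JSON object
    // per page. Illustrative shape (field values here are made up):
    //   {"format": "json-pages-jsonl", ...}
    //   {"id": "...", "url": "https://webrecorder.net/", "title": "...", ...}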

    // The seed URL should be the only entry in pages.jsonl
    const expectedPages = ["https://webrecorder.net/"];

    // Pages reached via the extra hops should land in extraPages.jsonl
    const expectedExtraPages = [
      "https://webrecorder.net/blog",
      "https://webrecorder.net/about",
      "https://webrecorder.net/contact",
      "https://webrecorder.net/faq",
    ];

    // The first line of each file is the header, not a page, so subtract 1
    expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);
    expect(crawledExtraPagesArray.length - 1).toEqual(
      expectedExtraPages.length,
    );

    // Every non-header line's URL must be among the expected URLs
    for (const page of crawledPagesArray) {
      const url = JSON.parse(page).url;
      if (!url) {
        // Header line has no url field; skip it
        continue;
      }
      expect(expectedPages.indexOf(url) >= 0).toBe(true);
    }

    for (const page of crawledExtraPagesArray) {
      const url = JSON.parse(page).url;
      if (!url) {
        continue;
      }
      expect(expectedExtraPages.indexOf(url) >= 0).toBe(true);
    }
  },
  extraHopsTimeout,
);
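
As an aside, the two URL-checking loops above could be factored into a small helper. A minimal sketch (hypothetical, not part of the repo; the name pageUrls is made up):

// Hypothetical helper: extract page URLs from a pages JSONL string,
// skipping any line without a url field (i.e. the header)
function pageUrls(jsonl) {
  return jsonl
    .trim()
    .split("\n")
    .map((line) => JSON.parse(line).url)
    .filter((url) => Boolean(url));
}

// Example usage in place of the loops (Jest compares Sets by contents):
//   expect(new Set(pageUrls(crawledExtraPages))).toEqual(new Set(expectedExtraPages));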