Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 06:23:16 +00:00)

- Follow-up to #743: page retries are simply added back to the same queue with the `retry` param incremented and a higher score (after extraHops), to ensure retries are added at the end.
- Score calculation is: `score = depth + (extraHops * MAX_DEPTH) + (retry * MAX_DEPTH * 2)`; this ensures that retries have lower priority than extraHops, and that additional retries have even lower priority (higher score). See the sketch after this list.
- A warning is logged when a retry happens; an error is logged only when all retries are exhausted.
- Back to one failure list; URLs are added there only when all retries are exhausted.
- Rename --numRetries -> --maxRetries / --retries for clarity.
- State load: allow retrying previously failed URLs if --maxRetries is higher than on the previous run.
- Ensure this works with --failOnFailedStatus: if provided, invalid status codes (>= 400) are retried along with page load failures.
- Fixes #132

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
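As a rough illustration of the scoring scheme above, here is a minimal sketch; the MAX_DEPTH value and the queueScore helper name are assumptions for illustration, not the crawler's actual constants or API.

// Illustrative sketch only: the MAX_DEPTH value and queueScore name are
// assumptions, not taken from the browsertrix-crawler source.
const MAX_DEPTH = 1_000_000;

// Lower score = higher queue priority. Extra-hop pages sort behind same-depth
// pages, and each retry sorts behind all extra-hop pages.
function queueScore(depth, extraHops, retry) {
  return depth + extraHops * MAX_DEPTH + retry * MAX_DEPTH * 2;
}

queueScore(2, 0, 0); // 2        -- fresh page at depth 2
queueScore(2, 1, 0); // 1000002  -- extra-hop page at the same depth
queueScore(2, 0, 1); // 2000002  -- first retry: even lower priority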
56 lines
1.7 KiB
JavaScript
import child_process from "child_process";
import fs from "fs";

test("ensure that stats file is modified", async () => {
  const child = child_process.exec(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --generateWACZ --text --limit 3 --exclude community --collection file-stats --statsFilename progress.json",
  );

  // detect crawler exit
  let crawler_exited = false;
  child.on("exit", function () {
    crawler_exited = true;
  });

  // helper function to sleep
  const sleep = (ms) => new Promise((res) => setTimeout(res, ms));

  // wait for stats file creation up to 30 secs (to not wait indefinitely)
  let counter = 0;
  while (!fs.existsSync("test-crawls/progress.json")) {
    await sleep(100);
    counter++;
    expect(counter < 300).toBe(true);
  }

  // get initial modification time
  const initial_mtime = fs.fstatSync(
    fs.openSync("test-crawls/progress.json", "r"),
  ).mtime;

  // wait for crawler exit
  while (!crawler_exited) {
    await sleep(100);
  }

  // get final modification time
  const final_mtime = fs.fstatSync(
    fs.openSync("test-crawls/progress.json", "r"),
  ).mtime;

  // compare initial and final modification time
  const diff = Math.abs(final_mtime - initial_mtime);
  expect(diff > 0).toBe(true);
});

test("check that stats file format is correct", () => {
  const data = fs.readFileSync("test-crawls/progress.json", "utf8");
  const dataJSON = JSON.parse(data);
  expect(dataJSON.crawled).toEqual(3);
  expect(dataJSON.total).toEqual(3);
  expect(dataJSON.pending).toEqual(0);
  expect(dataJSON.failed).toEqual(0);
  expect(dataJSON.limit.max).toEqual(3);
  expect(dataJSON.limit.hit).toBe(true);
  expect(dataJSON.pendingPages.length).toEqual(0);
});
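For reference, the second test above implies a stats file shaped roughly like the sketch below; only the fields asserted by the test are shown, and the real progress.json may contain additional fields not covered here.

// Illustrative shape of test-crawls/progress.json after this 3-page crawl,
// inferred solely from the assertions above; values and extra fields may differ.
const exampleStats = {
  crawled: 3,
  total: 3,
  pending: 0,
  failed: 0,
  limit: { max: 3, hit: true },
  pendingPages: [],
};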