Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 06:23:16 +00:00)
Add option to output stats file live, i.e. after each page crawled (#374)
* Add option to output stats file live, i.e. after each page crawled
* Always output stat files after each page crawled (+ test)
* Fix inversion between expected and test value
parent afecec01bd
commit d72443ced3
3 changed files with 28 additions and 4 deletions
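With this change, the stats file named by the existing --statsFilename option is rewritten after every page rather than only at the end of the crawl. A minimal invocation, trimmed from the command the new test runs (the --generateWACZ, --text, and --collection flags used there are incidental to the stats behavior):

    docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl \
      --url http://www.example.com/ --statsFilename progress.json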
@@ -820,7 +820,7 @@ self.__bx_behaviors.selectMainBehavior();
       await this.pagesFH.close();
     }

-    await this.writeStats(true);
+    await this.writeStats();

     // extra wait for all resources to land into WARCs
     await this.awaitPendingClear();
@@ -1023,7 +1023,7 @@ self.__bx_behaviors.selectMainBehavior();
     logger.debug("Memory", {maxHeapUsed: this.maxHeapUsed, maxHeapTotal: this.maxHeapTotal, ...memUsage}, "memory");
   }

-  async writeStats(toFile=false) {
+  async writeStats() {
     if (!this.params.logging.includes("stats")) {
       return;
     }
@@ -1046,7 +1046,7 @@ self.__bx_behaviors.selectMainBehavior();
     logger.info("Crawl statistics", stats, "crawlStatus");
     this.logMemory();

-    if (toFile && this.params.statsFilename) {
+    if (this.params.statsFilename) {
       try {
         await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2));
       } catch (err) {
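Read together, the three hunks above drop the toFile flag so that writeStats() writes the stats file on every call, i.e. after each page crawled. A sketch of the resulting method: the construction of the stats object sits between the hunks and is not shown in the diff, so the placeholder below is an assumption (field names taken from the assertions in the new tests/file_stats.test.js), as is the body of the catch block.

    async writeStats() {
      if (!this.params.logging.includes("stats")) {
        return;
      }

      // Elided between the hunks: the real method computes these values
      // from crawl state. Field names mirror tests/file_stats.test.js.
      const stats = {crawled: 0, total: 0, pending: 0, failed: 0,
                     limit: {max: 0, hit: false}, pendingPages: []};

      logger.info("Crawl statistics", stats, "crawlStatus");
      this.logMemory();

      // The toFile gate is gone: if a stats file was requested,
      // it is rewritten on every call.
      if (this.params.statsFilename) {
        try {
          await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2));
        } catch (err) {
          // catch body not shown in the diff
        }
      }
    }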
@@ -30,7 +30,7 @@ test("check that URLs are crawled 2 extra hops beyond depth", async () => {
   ];

   // first line is the header, not page, so adding -1
-  expect(expectedPages.length).toEqual(crawledPagesArray.length - 1);
+  expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);

   for (const page of crawledPagesArray) {
     const url = JSON.parse(page).url;
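The assertion swap fixes reporting rather than logic: Jest treats the value passed to expect() as the actual result and the matcher argument as the expectation, so the corrected order labels the two values accurately in a failure message.

    // expect(<actual>).toEqual(<expected>)
    expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);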
tests/file_stats.test.js (new file, 24 lines)

@@ -0,0 +1,24 @@
+import child_process from "child_process";
+import fs from "fs";
+
+
+test("ensure crawl run with docker with stats file passes", async () => {
+  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection file-stats --statsFilename progress.json");
+
+});
+
+test("check that a stats file exists", () => {
+  expect(fs.existsSync("test-crawls/progress.json")).toBe(true);
+});
+
+test("check that stats file format is correct", () => {
+  const data = fs.readFileSync("test-crawls/progress.json", "utf8");
+  const dataJSON = JSON.parse(data);
+  expect(dataJSON.crawled).toEqual(1);
+  expect(dataJSON.total).toEqual(1);
+  expect(dataJSON.pending).toEqual(0);
+  expect(dataJSON.failed).toEqual(0);
+  expect(dataJSON.limit.max).toEqual(0);
+  expect(dataJSON.limit.hit).toBe(false);
+  expect(dataJSON.pendingPages.length).toEqual(0);
+});
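For the single-page example.com crawl above, those assertions imply a progress.json of roughly this shape, pretty-printed as JSON.stringify(stats, null, 2) would emit it; any fields beyond the ones asserted on are unknown:

    {
      "crawled": 1,
      "total": 1,
      "pending": 0,
      "failed": 0,
      "limit": {
        "max": 0,
        "hit": false
      },
      "pendingPages": []
    }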