From d72443ced39dcb4a644423b26c2d39a6fe631f1c Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 15 Sep 2023 00:16:19 +0200 Subject: [PATCH] Add option to output stats file live, i.e. after each page crawled (#374) * Add option to output stats file live, i.e. after each page crawled * Always output stat files after each page crawled (+ test) * Fix inversion between expected and test value --- crawler.js | 6 +++--- tests/extra_hops_depth.test.js | 2 +- tests/file_stats.test.js | 24 ++++++++++++++++++++++++ 3 files changed, 28 insertions(+), 4 deletions(-) create mode 100644 tests/file_stats.test.js diff --git a/crawler.js b/crawler.js index c82a8ca0..2c43a050 100644 --- a/crawler.js +++ b/crawler.js @@ -820,7 +820,7 @@ self.__bx_behaviors.selectMainBehavior(); await this.pagesFH.close(); } - await this.writeStats(true); + await this.writeStats(); // extra wait for all resources to land into WARCs await this.awaitPendingClear(); @@ -1023,7 +1023,7 @@ self.__bx_behaviors.selectMainBehavior(); logger.debug("Memory", {maxHeapUsed: this.maxHeapUsed, maxHeapTotal: this.maxHeapTotal, ...memUsage}, "memory"); } - async writeStats(toFile=false) { + async writeStats() { if (!this.params.logging.includes("stats")) { return; } @@ -1046,7 +1046,7 @@ self.__bx_behaviors.selectMainBehavior(); logger.info("Crawl statistics", stats, "crawlStatus"); this.logMemory(); - if (toFile && this.params.statsFilename) { + if (this.params.statsFilename) { try { await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2)); } catch (err) { diff --git a/tests/extra_hops_depth.test.js b/tests/extra_hops_depth.test.js index 2d0ddfbb..be6e48f6 100644 --- a/tests/extra_hops_depth.test.js +++ b/tests/extra_hops_depth.test.js @@ -30,7 +30,7 @@ test("check that URLs are crawled 2 extra hops beyond depth", async () => { ]; // first line is the header, not page, so adding -1 - expect(expectedPages.length).toEqual(crawledPagesArray.length - 1); + expect(crawledPagesArray.length - 1).toEqual(expectedPages.length); for (const page of crawledPagesArray) { const url = JSON.parse(page).url; diff --git a/tests/file_stats.test.js b/tests/file_stats.test.js new file mode 100644 index 00000000..3d7eae52 --- /dev/null +++ b/tests/file_stats.test.js @@ -0,0 +1,24 @@ +import child_process from "child_process"; +import fs from "fs"; + + +test("ensure crawl run with docker with stats file passes", async () => { + child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection file-stats --statsFilename progress.json"); + +}); + +test("check that a stats file exists", () => { + expect(fs.existsSync("test-crawls/progress.json")).toBe(true); +}); + +test("check that stats file format is correct", () => { + const data = fs.readFileSync("test-crawls/progress.json", "utf8"); + const dataJSON = JSON.parse(data); + expect(dataJSON.crawled).toEqual(1); + expect(dataJSON.total).toEqual(1); + expect(dataJSON.pending).toEqual(0); + expect(dataJSON.failed).toEqual(0); + expect(dataJSON.limit.max).toEqual(0); + expect(dataJSON.limit.hit).toBe(false); + expect(dataJSON.pendingPages.length).toEqual(0); +}); \ No newline at end of file