Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
Add option to output stats file live, i.e. after each page crawled (#374)

* Add option to output stats file live, i.e. after each page crawled
* Always output stat files after each page crawled (+ test)
* Fix inversion between expected and test value

Parent: afecec01bd
Commit: d72443ced3
3 changed files with 28 additions and 4 deletions
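With the stats file now rewritten after each page rather than only at the end of the crawl, an external process can poll it for live progress. The sketch below is illustrative only and not part of this commit; the field names (crawled, total, pending, failed) and the test-crawls/progress.json path are taken from the new test file added at the end of this diff.

// monitor-stats.js (hypothetical helper script): poll the live stats file while a crawl runs
import fs from "fs";

setInterval(() => {
  try {
    const stats = JSON.parse(fs.readFileSync("test-crawls/progress.json", "utf8"));
    console.log(`crawled ${stats.crawled}/${stats.total}, pending ${stats.pending}, failed ${stats.failed}`);
  } catch (err) {
    // the stats file may not exist yet or may be mid-write; skip this tick and retry
  }
}, 5000);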
@@ -820,7 +820,7 @@ self.__bx_behaviors.selectMainBehavior();
       await this.pagesFH.close();
     }

-    await this.writeStats(true);
+    await this.writeStats();

     // extra wait for all resources to land into WARCs
     await this.awaitPendingClear();
@@ -1023,7 +1023,7 @@ self.__bx_behaviors.selectMainBehavior();
     logger.debug("Memory", {maxHeapUsed: this.maxHeapUsed, maxHeapTotal: this.maxHeapTotal, ...memUsage}, "memory");
   }

-  async writeStats(toFile=false) {
+  async writeStats() {
     if (!this.params.logging.includes("stats")) {
       return;
     }
@@ -1046,7 +1046,7 @@ self.__bx_behaviors.selectMainBehavior();
     logger.info("Crawl statistics", stats, "crawlStatus");
     this.logMemory();

-    if (toFile && this.params.statsFilename) {
+    if (this.params.statsFilename) {
       try {
         await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2));
       } catch (err) {
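Taken together, the crawler hunks above drop the toFile flag: writeStats() now writes the stats file whenever a stats filename is configured (the --statsFilename option exercised by the new test), no matter where in the crawl it is called from, and the write is wrapped in a try/catch so a failure need not abort the crawl. A minimal standalone sketch of that pattern, using illustrative names that are not the crawler's own helpers:

import fsp from "fs/promises";

// Illustrative helper (not the crawler's API): serialize current stats to disk after each page.
async function writeStatsFile(statsFilename, stats) {
  if (!statsFilename) {
    return;
  }
  try {
    await fsp.writeFile(statsFilename, JSON.stringify(stats, null, 2));
  } catch (err) {
    console.warn("Unable to write stats file", err);
  }
}

// example call, e.g. once per crawled page:
// await writeStatsFile("progress.json", {crawled: 12, total: 40, pending: 3, failed: 0});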
@@ -30,7 +30,7 @@ test("check that URLs are crawled 2 extra hops beyond depth", async () => {
   ];

   // first line is the header, not page, so adding -1
-  expect(expectedPages.length).toEqual(crawledPagesArray.length - 1);
+  expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);

   for (const page of crawledPagesArray) {
     const url = JSON.parse(page).url;
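The swapped assertion above matters because Jest treats the value inside expect() as the received value and the matcher argument as the expected value, so a failure now reports the actual page count against the expected list length instead of the reverse. A small illustration with hypothetical data, not taken from the real test fixtures:

test("received value goes inside expect(), expected value goes to the matcher", () => {
  const crawledPagesArray = ["<header line>", "page-1", "page-2"];  // hypothetical pages.jsonl lines
  const expectedPages = ["page-1", "page-2"];

  // first line is the header, not a page, so subtract 1 from the actual count
  expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);
});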
tests/file_stats.test.js (new file, 24 additions)

@@ -0,0 +1,24 @@
+import child_process from "child_process";
+import fs from "fs";
+
+
+test("ensure crawl run with docker with stats file passes", async () => {
+  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection file-stats --statsFilename progress.json");
+
+});
+
+test("check that a stats file exists", () => {
+  expect(fs.existsSync("test-crawls/progress.json")).toBe(true);
+});
+
+test("check that stats file format is correct", () => {
+  const data = fs.readFileSync("test-crawls/progress.json", "utf8");
+  const dataJSON = JSON.parse(data);
+  expect(dataJSON.crawled).toEqual(1);
+  expect(dataJSON.total).toEqual(1);
+  expect(dataJSON.pending).toEqual(0);
+  expect(dataJSON.failed).toEqual(0);
+  expect(dataJSON.limit.max).toEqual(0);
+  expect(dataJSON.limit.hit).toBe(false);
+  expect(dataJSON.pendingPages.length).toEqual(0);
+});
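The field-by-field checks in the new test could also be written as one structural assertion plus a length check; this variant is illustrative only, assumes the same fs import as above, and is not part of the commit:

test("stats file matches the expected shape", () => {
  const dataJSON = JSON.parse(fs.readFileSync("test-crawls/progress.json", "utf8"));
  expect(dataJSON).toMatchObject({crawled: 1, total: 1, pending: 0, failed: 0, limit: {max: 0, hit: false}});
  expect(dataJSON.pendingPages).toHaveLength(0);
});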