browsertrix-crawler/tests/rollover-writer.test.js
Ilya Kreymer f6edec0b95
Fix for --rolloverSize for individual WARCs in 1.x (#542)
Fixes #533 

Fixes rollover in WARCWriter, separate from combined WARC rollover size:
- check rolloverSize and close previous WARCs when their size exceeds it
- add timestamp to resource WARC filenames to support rollover, e.g.
screenshots-{ts}.warc.gz
- use append mode for all write streams, just in case
- tests: add test for rollover of individual WARCs with 500K size limit
- tests: update screenshot tests to account for WARCs now being named
screenshots-{ts}.warc.gz instead of just screenshots.warc.gz
2024-04-15 13:43:08 -07:00

30 lines
866 B
JavaScript

import child_process from "child_process";
import fs from "fs";
// Integration test: runs a crawl in docker with --rolloverSize 500000 and
// --screenshot view, then checks that the collection's archive directory
// contains several WARC files of each kind.
test("set rollover to 500K and ensure individual WARCs rollover, including screenshots", async () => {
  const crawlCommand =
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --limit 5 --collection rollover-500K --rolloverSize 500000 --screenshot view";
  child_process.execSync(crawlCommand);

  const archiveDir = "test-crawls/collections/rollover-500K/archive";
  const archiveEntries = fs.readdirSync(archiveDir);
  console.log(archiveEntries);

  // Tally directory entries by filename prefix; entries matching neither
  // prefix are ignored.
  const countWithPrefix = (prefix) =>
    archiveEntries.filter((entry) => entry.startsWith(prefix)).length;

  const mainCount = countWithPrefix("rec-");
  const screenshotCount = countWithPrefix("screenshots-");

  // more than 5 main WARCs, i.e. at least 6
  expect(mainCount).toBeGreaterThan(5);
  // more than 1 screenshot WARC, i.e. at least 2
  expect(screenshotCount).toBeGreaterThan(1);
});