mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00

Fixes #533. Fixes rollover in WARCWriter, separate from the combined WARC rollover size: (1) check rolloverSize and close the previous WARC when its size exceeds the limit; (2) add a timestamp to resource WARC filenames to support rollover, e.g. screenshots-{ts}.warc.gz; (3) use append mode for all write streams, just in case; (4) tests: add a test for rollover of individual WARCs with a 500K size limit; (5) tests: update the screenshot tests to account for WARCs now being named screenshots-{ts}.warc.gz instead of just screenshots.warc.gz.
30 lines
866 B
JavaScript
30 lines
866 B
JavaScript
import child_process from "child_process";
|
|
import fs from "fs";
|
|
|
|
/**
 * Runs a crawl of https://webrecorder.net/ with `--rolloverSize 500000` and
 * `--screenshot view`, then checks the collection's archive directory:
 * entries prefixed "rec-" and entries prefixed "screenshots-" must each have
 * been split across multiple files.
 */
test("set rollover to 500K and ensure individual WARCs rollover, including screenshots", async () => {
  // Blocks until the containerized crawl exits; the volume mount maps
  // $PWD/test-crawls on the host to /crawls inside the container.
  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --limit 5 --collection rollover-500K --rolloverSize 500000 --screenshot view"
  );

  const archiveEntries = fs.readdirSync("test-crawls/collections/rollover-500K/archive");

  // Dump the listing so a failing run shows which files were produced.
  console.log(archiveEntries);

  // Number of directory entries whose name begins with the given prefix.
  const countWithPrefix = (prefix) =>
    archiveEntries.filter((entry) => entry.startsWith(prefix)).length;

  const mainCount = countWithPrefix("rec-");
  const screenshotCount = countWithPrefix("screenshots-");

  // expect at least 6 main WARCs
  expect(mainCount).toBeGreaterThan(5);

  // expect at least 2 screenshot WARCs
  expect(screenshotCount).toBeGreaterThan(1);
});
|