mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00

* generates combined WARCs in the collection root directory with suffix `_0.warc`, `_1.warc`, etc. * each combined WARC is limited by the size given in `--rolloverSize`; if that size is exceeded a new WARC is created, otherwise data is appended to the previous WARC. * add test for --combineWARC flag * add improved lint rules Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
20 lines
No EOL
582 B
JavaScript
20 lines
No EOL
582 B
JavaScript
const fs = require("fs");
|
|
const path = require("path");
|
|
|
|
/**
 * Return the size of a file as reported by the file system.
 *
 * @param {string} filename - Path of the file to inspect.
 * @returns {number} The `size` field of the `fs.Stats` object for `filename`.
 * @throws {Error} Propagates any error raised by `fs.statSync`
 *   (for example when `filename` does not exist).
 */
function getFileSize(filename) {
  const { size } = fs.statSync(filename);
  return size;
}
|
|
|
|
test("check that a combined warc file is under the rolloverSize", () => {
  // Size threshold in bytes that a file must stay below.
  // NOTE(review): presumably mirrors the --rolloverSize passed to the crawl
  // that produced this collection; confirm against the crawl invocation.
  const sizeLimit = 10000;

  const archiveDir = path.join("crawls/collections/wr-net/wacz", "archive");
  const warcNames = fs.readdirSync(archiveDir);

  // The test passes as soon as at least one file in the archive directory is
  // smaller than the limit; it does not require every file to be under it.
  const hasFileUnderLimit = warcNames.some(
    (name) => getFileSize(path.join(archiveDir, name)) < sizeLimit
  );

  expect(hasFileUnderLimit).toBe(true);
});