browsertrix-crawler/tests/text.test.js
Emma Dickson 24e2c4ddf8
Create --combineWARC flag that combines generated WARCs into a single WARC up to the rollover size (#33)
* generates combined WARCs in the collection root directory with suffixes `_0.warc`, `_1.warc`, etc.
* each combined WARC is limited by the size in `--rolloverSize`; if the limit is exceeded, a new WARC is created, otherwise the output is appended to the previous WARC.
* add test for --combineWARC flag
* add improved lint rules

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
2021-03-31 10:41:27 -07:00

22 lines
999 B
JavaScript

const fs = require("fs");
const md5 = require("md5");
// Verifies that pages.jsonl is present in the collection's pages folder.
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
  const collectionPagesPath = "crawls/collections/wr-net/pages/pages.jsonl";
  const isPresent = fs.existsSync(collectionPagesPath);

  expect(isPresent).toBe(true);
});
// Verifies that pages.jsonl is present in the pages folder under the wacz directory.
test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
  const waczPagesPath = "crawls/collections/wr-net/wacz/pages/pages.jsonl";
  const isPresent = fs.existsSync(waczPagesPath);

  expect(isPresent).toBe(true);
});
// Verifies that the "text" field of the second line of pages.jsonl hashes to the
// same md5 value in the collection's pages folder, in the wacz pages folder, and
// in the checked-in fixture.
test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
  // Returns the md5 of the "text" field of the JSON record on the second line
  // (index 1) of the given .jsonl file.
  // NOTE(review): index 0 is skipped — presumably a header record; confirm.
  const hashPageText = (jsonlPath) => {
    const lines = fs.readFileSync(jsonlPath, "utf8").split("\n");
    return md5(JSON.parse(lines[1])["text"]);
  };

  // Each hash is named after the file it is computed from (the original locals
  // had the wacz and collection names swapped, which made failures misleading).
  const waczHash = hashPageText("crawls/collections/wr-net/wacz/pages/pages.jsonl");
  const crawlHash = hashPageText("crawls/collections/wr-net/pages/pages.jsonl");
  const fixtureHash = hashPageText("tests/fixtures/pages.jsonl");

  // Compare the collection's pages file against the fixture first, then against
  // the wacz copy.
  expect(crawlHash).toEqual(fixtureHash);
  expect(crawlHash).toEqual(waczHash);
});