import child_process from "child_process";
import fs from "fs";
import path from "path";
import md5 from "md5";
/**
 * Runs the crawl that the remaining tests in this file inspect.
 *
 * Every step is a synchronous shell command; `execSync` throws when a command
 * exits with a non-zero status, which fails the test.
 */
test("ensure basic crawl run with docker run passes", () => {
  // 1. Crawl a single URL into the "wr-net" collection, asking for a WACZ,
  //    extracted text, a combined WARC and the title/description metadata.
  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\"");

  // 2. Validate the generated WACZ with the validator bundled in the image.
  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz");

  // 3. Unpack the WACZ so later tests can read its contents from
  //    test-crawls/collections/wr-net/wacz.
  child_process.execSync("unzip test-crawls/collections/wr-net/wacz_placeholder".replace("wacz_placeholder", "wr-net.wacz -d test-crawls/collections/wr-net/wacz"));
});
/**
 * Passes when the "wr-net" collection directory contains at least one entry
 * whose name ends in "_0.warc.gz".
 */
test("check that a combined warc file exists in the archive folder", () => {
  const collectionEntries = fs.readdirSync("test-crawls/collections/wr-net");

  const hasCombinedWarc = collectionEntries.some((entryName) =>
    entryName.endsWith("_0.warc.gz"),
  );

  expect(hasCombinedWarc).toBe(true);
});
/**
 * Passes when at least one entry in the unzipped WACZ's archive directory is
 * smaller than the rollover size.
 *
 * NOTE(review): this is an "any entry" check, not an "every entry" check —
 * confirm that is the intended guarantee.
 */
test("check that a combined warc file is under the rolloverSize", () => {
  // Same value as the --rolloverSize flag passed to the crawl command.
  const rolloverSizeBytes = 10000;
  const archiveDir = path.join("test-crawls/collections/wr-net/wacz", "archive");

  const hasEntryUnderLimit = fs
    .readdirSync(archiveDir)
    .some((entryName) => fs.statSync(path.join(archiveDir, entryName)).size < rolloverSizeBytes);

  expect(hasEntryUnderLimit).toBe(true);
});
/** Passes when the collection's own pages/pages.jsonl file is present on disk. */
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
  const collectionPagesFile = "test-crawls/collections/wr-net/pages/pages.jsonl";

  expect(fs.existsSync(collectionPagesFile)).toBe(true);
});
/** Passes when pages/pages.jsonl is present inside the unzipped WACZ directory. */
test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
  const waczPagesFile = "test-crawls/collections/wr-net/wacz/pages/pages.jsonl";

  expect(fs.existsSync(waczPagesFile)).toBe(true);
});
/**
 * Compares the MD5 of the "text" field taken from the second line of three
 * pages.jsonl files: the one inside the unzipped WACZ, the one in the
 * collection's pages folder, and the checked-in fixture. All three must match.
 */
test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
  // Index 1 is the second line of the file; presumably line 0 is a header
  // record rather than a page entry — confirm against the pages.jsonl format.
  const hashPageText = (pagesFile) => {
    const lines = fs.readFileSync(pagesFile, "utf8").split("\n");
    return md5(JSON.parse(lines[1])["text"]);
  };

  const waczHash = hashPageText("test-crawls/collections/wr-net/wacz/pages/pages.jsonl");
  const crawlHash = hashPageText("test-crawls/collections/wr-net/pages/pages.jsonl");
  const fixtureHash = hashPageText("tests/fixtures/pages.jsonl");

  expect(crawlHash).toEqual(fixtureHash);
  expect(crawlHash).toEqual(waczHash);
});
/**
 * Checks that datapackage.json exists in the unzipped WACZ and that its
 * `title` and `description` equal the values passed to the crawl command via
 * --title and --description.
 */
test("check that the supplied title and description made it into datapackage.json", () => {
  const dataPackagePath = "test-crawls/collections/wr-net/wacz/datapackage.json";

  expect(fs.existsSync(dataPackagePath)).toBe(true);

  const { title, description } = JSON.parse(fs.readFileSync(dataPackagePath, "utf8"));

  expect(title).toEqual("test title");
  expect(description).toEqual("test description");
});