Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00

Support for rollover size and custom WARC prefix templates:
- reenable --rolloverSize (default: 1GB), which determines when a new WARC is created
- support a custom WARC prefix via --warcPrefix, prepended to each new WARC filename; tested via basic_crawl.test.js
- the filename template for new files is `${prefix}-${crawlId}-$ts-${this.workerid}.warc${this.gzip ? ".gz" : ""}`, with `$ts` replaced by the current timestamp when the file is created (see the first sketch after this list)

Improved support for long (non-terminating) responses, such as from live-streaming:
- pass a size to the CDP takeStream read so data is streamed in fixed chunks, defaulting to 64k (see the second sketch after this list)
- change the shutdown order: first close the browser, then finish writing all WARCs, so that any truncated responses can still be captured
- ensure a WARC is not rewritten after it is done; skip writing records if the stream has already been flushed
- add a timeout to the final fetch tasks to avoid hanging indefinitely on finish
- fix adding the `WARC-Truncated` header: it must be set after the stream is finished, to determine whether the response was truncated
- move the temp download `tmp-dl` dir to the main temp folder, outside of the collection (no need for it to be there)
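A minimal sketch of how the filename template above could be expanded. The helper name `createWARCFilename` and its parameters are assumptions for illustration, not the crawler's actual API:

```js
// Hypothetical helper, for illustration only (not browsertrix-crawler's actual code).
function createWARCFilename({ prefix, crawlId, workerid, gzip }) {
  // "$ts" in the template is replaced with the current timestamp at file-creation time,
  // e.g. "20251019143317" for 2025-10-19 14:33:17 UTC.
  const ts = new Date().toISOString().replace(/[^\d]/g, "").slice(0, 14);
  return `${prefix}-${crawlId}-${ts}-${workerid}.warc${gzip ? ".gz" : ""}`;
}

// With --warcPrefix custom-prefix, worker 0 of crawl "wr-net" might produce:
// "custom-prefix-wr-net-20251019143317-0.warc.gz"
```

And a rough sketch of reading a CDP response-body stream in fixed-size chunks, assuming a stream handle obtained via `Fetch.takeResponseBodyAsStream` and a DevTools Protocol session `cdp` (e.g. a Puppeteer `CDPSession`); the chunk size mirrors the 64k default described above:

```js
// Sketch only: reads a CDP IO stream in fixed-size chunks until EOF.
async function readStreamInChunks(cdp, streamHandle, chunkSize = 64 * 1024) {
  const chunks = [];
  let eof = false;
  while (!eof) {
    // IO.read accepts an optional "size" to cap how much data is returned per call
    const resp = await cdp.send("IO.read", { handle: streamHandle, size: chunkSize });
    chunks.push(Buffer.from(resp.data, resp.base64Encoded ? "base64" : "utf-8"));
    eof = resp.eof;
  }
  return Buffer.concat(chunks);
}
```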
122 lines
3.7 KiB
JavaScript
import child_process from "child_process";
import fs from "fs";
import path from "path";
import md5 from "md5";
test("ensure basic crawl run with docker run passes", async () => {
|
|
child_process.execSync(
|
|
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix',
|
|
);
|
|
|
|
child_process.execSync(
|
|
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz",
|
|
);
|
|
|
|
child_process.execSync(
|
|
"unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz",
|
|
);
|
|
});
|
|
|
|
test("check that individual WARCs have correct prefix and are under rollover size", () => {
|
|
const archiveWarcLists = fs.readdirSync(
|
|
"test-crawls/collections/wr-net/archive",
|
|
);
|
|
|
|
archiveWarcLists.forEach((filename) => {
|
|
expect(filename.startsWith("custom-prefix-")).toEqual(true);
|
|
const size = fs.statSync(
|
|
path.join("test-crawls/collections/wr-net/archive", filename),
|
|
).size;
|
|
expect(size < 10000).toEqual(true);
|
|
});
|
|
});
|
|
|
|
test("check that a combined warc file exists in the archive folder", () => {
|
|
const warcLists = fs.readdirSync("test-crawls/collections/wr-net");
|
|
var captureFound = 0;
|
|
|
|
for (var i = 0; i < warcLists.length; i++) {
|
|
if (warcLists[i].endsWith("_0.warc.gz")) {
|
|
captureFound = 1;
|
|
}
|
|
}
|
|
expect(captureFound).toEqual(1);
|
|
});
|
|
|
|
test("check that a combined warc file is under the rolloverSize", () => {
|
|
const warcLists = fs.readdirSync(
|
|
path.join("test-crawls/collections/wr-net/wacz", "archive"),
|
|
);
|
|
let rolloverSize = 0;
|
|
|
|
function getFileSize(filename) {
|
|
return fs.statSync(filename).size;
|
|
}
|
|
|
|
for (let i = 0; i < warcLists.length; i++) {
|
|
const size = getFileSize(
|
|
path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]),
|
|
);
|
|
if (size < 10000) {
|
|
rolloverSize = 1;
|
|
}
|
|
}
|
|
expect(rolloverSize).toEqual(1);
|
|
});
|
|
|
|
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
|
|
expect(
|
|
fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl"),
|
|
).toBe(true);
|
|
});
|
|
|
|
test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
|
|
expect(
|
|
fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl"),
|
|
).toBe(true);
|
|
});
|
|
|
|
test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
|
|
const crawl_hash = md5(
|
|
JSON.parse(
|
|
fs
|
|
.readFileSync(
|
|
"test-crawls/collections/wr-net/wacz/pages/pages.jsonl",
|
|
"utf8",
|
|
)
|
|
.split("\n")[1],
|
|
)["text"],
|
|
);
|
|
const wacz_hash = md5(
|
|
JSON.parse(
|
|
fs
|
|
.readFileSync(
|
|
"test-crawls/collections/wr-net/pages/pages.jsonl",
|
|
"utf8",
|
|
)
|
|
.split("\n")[1],
|
|
)["text"],
|
|
);
|
|
const fixture_hash = md5(
|
|
JSON.parse(
|
|
fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1],
|
|
)["text"],
|
|
);
|
|
|
|
expect(wacz_hash).toEqual(fixture_hash);
|
|
expect(wacz_hash).toEqual(crawl_hash);
|
|
});
|
|
|
|
test("check that the supplied title and description made it into datapackage.json", () => {
|
|
expect(
|
|
fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json"),
|
|
).toBe(true);
|
|
|
|
const data = fs.readFileSync(
|
|
"test-crawls/collections/wr-net/wacz/datapackage.json",
|
|
"utf8",
|
|
);
|
|
const dataPackageJSON = JSON.parse(data);
|
|
expect(dataPackageJSON.title).toEqual("test title");
|
|
expect(dataPackageJSON.description).toEqual("test description");
|
|
});
|