2022-10-24 15:30:10 +02:00
|
|
|
import fs from "fs";
|
|
|
|
import zlib from "zlib";
|
2024-05-22 15:47:05 -07:00
|
|
|
import path from "path";
|
2022-10-24 15:30:10 +02:00
|
|
|
import child_process from "child_process";
|
2021-07-07 18:56:52 -04:00
|
|
|
|
2024-05-22 15:47:05 -07:00
|
|
|
// Runs a one-page crawl in the browsertrix-crawler Docker image, with the YAML
// config piped to the container on stdin, and passes only if the command exits
// successfully. The command bind-mounts ./test-crawls at /crawls and uses the
// "warcinfo" collection with --combineWARC.
test("run crawl", () => {
  let success = false;

  try {
    const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");

    // `input` is what feeds the config to the child's stdin (`docker run -i`
    // together with `--config stdin`). execSync has no `stdin` option -- the
    // recognised key is `stdio` -- so none is passed here.
    child_process.execSync(
      "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC",
      { input: configYaml, encoding: "utf8" },
    );

    success = true;
  } catch (error) {
    // execSync throws on a non-zero exit status; log the error so the cause
    // is visible in the test output before the assertion below fails.
    console.log(error);
  }

  expect(success).toBe(true);
});
|
|
|
|
|
|
|
|
// Gunzips the first "rec-"-prefixed file found in the collection's archive
// directory and checks that its text contains the expected warcinfo fields
// (operator, host, software and format lines).
test("check that the warcinfo for individual WARC is as expected", () => {
  const archiveDir = "test-crawls/collections/warcinfo/archive/";

  const recName = fs
    .readdirSync(archiveDir)
    .find((name) => name.startsWith("rec-"));

  // Fail with a clear assertion when no matching file exists, rather than
  // going on to read an empty path and reporting an unrelated ENOENT.
  expect(recName).toBeDefined();

  const filename = path.join(archiveDir, recName);
  const string = zlib.gunzipSync(fs.readFileSync(filename)).toString("utf8");

  expect(string.includes("operator: test")).toBe(true);
  expect(string.includes("host: hostname")).toBe(true);

  // Version strings vary between builds, so only their general shape is
  // checked. The dot in "warcio.js" is escaped so it matches a literal ".".
  expect(
    string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio\.js \d[\w.-]+\)/),
  ).not.toEqual(null);

  expect(string.includes("format: WARC File Format 1.1")).toBe(true);
});
|
|
|
|
|
|
|
|
// Gunzips the combined WARC file of the "warcinfo" collection and checks that
// its text contains the expected warcinfo fields (operator, host, software and
// format lines).
test("check that the warcinfo for combined WARC file is as expected", () => {
  const warcData = fs.readFileSync(
    "test-crawls/collections/warcinfo/warcinfo_0.warc.gz",
  );

  const string = zlib.gunzipSync(warcData).toString("utf8");

  expect(string.includes("operator: test")).toBe(true);
  expect(string.includes("host: hostname")).toBe(true);

  // Version strings vary between builds, so only their general shape is
  // checked. The dot in "warcio.js" is escaped so it matches a literal ".".
  expect(
    string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio\.js \d[\w.-]+\)/),
  ).not.toEqual(null);

  expect(string.includes("format: WARC File Format 1.1")).toBe(true);
});
|