2023-10-31 23:05:30 -07:00
|
|
|
import fs from "fs";
|
|
|
|
import child_process from "child_process";
|
|
|
|
|
|
|
|
// Runs a single-page crawl with text extraction enabled and verifies that the
// generated CDXJ index references both the urn:text and urn:textFinal records.
test("check that urn:text and urn:textfinal records are written to WARC", async () => {
  try {
    child_process.execSync(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc",
    );
  } catch (error) {
    // Best-effort: a failing crawl command is logged, not rethrown. The index
    // read and the assertions below still run and determine the test result.
    console.log(error.stderr);
  }

  const data = fs.readFileSync(
    "test-crawls/collections/text-extract/indexes/index.cdxj",
    { encoding: "utf-8" },
  );

  // includes() rather than `indexOf(...) > 0`: the latter wrongly treats a
  // match at offset 0 of the index as "not found".
  expect(data.includes("urn:text:https://www.nytimes.com/")).toBe(true);

  expect(data.includes("urn:textFinal:https://www.nytimes.com/")).toBe(true);
});
|