browsertrix-crawler/tests/text-extract.test.js
Emma Segal-Grossman 2a49406df7
Add Prettier to the repo, and format all the files! (#428)
This adds prettier to the repo, and sets up the pre-commit hook to
auto-format as well as lint.
Also updates the ignore files to exclude crawls, test-crawls, scratch, and dist as needed.
2023-11-09 16:11:11 -08:00

22 lines
797 B
JavaScript

import fs from "fs";
import child_process from "child_process";
/**
 * Integration test for text extraction.
 *
 * Runs the crawler image through `docker run` against a single page with
 * `--text to-warc,final-to-warc` and `--generateCDX`, then reads the generated
 * CDXJ index and checks that it mentions both the `urn:text:` and the
 * `urn:textFinal:` record for the crawled URL.
 *
 * Needs a working `docker` CLI; the crawl output is mounted at
 * `$PWD/test-crawls`.
 */
test("check that urn:text and urn:textfinal records are written to WARC", async () => {
  try {
    child_process.execSync(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc",
    );
  } catch (error) {
    // A failing crawl command is only logged, not rethrown: the index checks
    // below are what decide whether the test passes.
    console.log(error.stderr);
  }

  const data = fs.readFileSync(
    "test-crawls/collections/text-extract/indexes/index.cdxj",
    { encoding: "utf-8" },
  );

  // Use includes() rather than `indexOf(...) > 0`: a record key located at the
  // very start of the index (offset 0) is still a valid match.
  expect(data.includes("urn:text:https://www.nytimes.com/")).toBe(true);
  expect(data.includes("urn:textFinal:https://www.nytimes.com/")).toBe(true);
});