browsertrix-crawler/tests/text-extract.test.js

import fs from "fs";
import child_process from "child_process";

/**
 * Runs a single-page crawl in the browsertrix-crawler Docker image with
 * `--text to-warc,final-to-warc` and `--generateCDX`, then checks that the
 * generated CDXJ index lists both a `urn:text:` and a `urn:textFinal:` record
 * for the crawled URL.
 */
test("check that urn:text and urn:textfinal records are written to WARC", () => {
  try {
    child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc");
  } catch (error) {
    // Best-effort: a failed crawl is only logged here. If no index was
    // produced, the read below throws and fails the test.
    // stderr is captured as a Buffer, so decode it to get readable output.
    console.log(String(error.stderr));
  }

  const data = fs.readFileSync("test-crawls/collections/text-extract/indexes/index.cdxj", {"encoding": "utf-8"});

  // Use includes() rather than `indexOf(...) > 0`: a record that happens to be
  // the very first entry of the index (offset 0) must still count as present.
  expect(data.includes("urn:text:https://www.nytimes.com/")).toBe(true);

  expect(data.includes("urn:textFinal:https://www.nytimes.com/")).toBe(true);
});
improved text extraction: (addresses #403) (#404) - use DOMSnapshot.captureSnapshot instead of older DOM.getDocument to get the snapshot (consistent with ArchiveWeb.page) - should be slightly more performant - keep option to use DOM.getDocument - refactor warc resource writing to separate class, used by text extraction and screenshots - write extracted text to WARC files as 'urn:text:<url>' after page loads, similar to screenshots - also store final text to WARC as 'urn:textFinal:<url>' if it is different - cli options: update `--text` to take one or more comma-separated string options `--text to-warc,to-pages,final-to-warc`. For backwards compatibility, support `--text` and `--text true` to be equivalent to `--text to-pages`. --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net> 2023-10-31 23:05:30 -07:00			`import fs from "fs";`
			`import child_process from "child_process";`

			`test("check that urn:text and urn:textfinal records are written to WARC", async () => {`
			`try {`
			`child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc");`
			`} catch (error) {`
			`//console.log(new TextDecoder().decode(error));`
			`console.log(error.stderr);`
			`}`

			`const data = fs.readFileSync("test-crawls/collections/text-extract/indexes/index.cdxj", {"encoding": "utf-8"});`

			`expect(data.indexOf("urn:text:https://www.nytimes.com/") > 0).toBe(true);`

			`expect(data.indexOf("urn:textFinal:https://www.nytimes.com/") > 0).toBe(true);`
			`});`