improved text extraction: (addresses #403) (#404)

- use DOMSnapshot.captureSnapshot instead of older DOM.getDocument to get the snapshot (consistent with ArchiveWeb.page) - should be slightly more performant - keep option to use DOM.getDocument - refactor warc resource writing to separate class, used by text extraction and screenshots - write extracted text to WARC files as 'urn:text:<url>' after page loads, similar to screenshots - also store final text to WARC as 'urn:textFinal:<url>' if it is different - cli options: update `--text` to take one or more comma-separated string options `--text to-warc,to-pages,final-to-warc`. For backwards compatibility, support `--text` and `--text true` to be equivalent to `--text to-pages`. --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2025-10-19 14:33:17 +00:00 · 2023-10-31 23:05:30 -07:00 · 2023-10-31 23:05:30 -07:00 · 2aeda56d40
commit 2aeda56d40
parent 064db52272
8 changed files with 225 additions and 84 deletions
--- a/crawler.js
+++ b/crawler.js
@ -12,7 +12,7 @@ import yaml from "js-yaml";
 import * as warcio from "warcio";

 import { HealthChecker } from "./util/healthcheck.js";
-import { TextExtract } from "./util/textextract.js";
+import { TextExtractViaSnapshot } from "./util/textextract.js";
 import { initStorage, getFileSize, getDirSize, interpolateFilename, checkDiskUtilization } from "./util/storage.js";
 import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screencaster.js";
 import { Screenshots } from "./util/screenshots.js";
@ -492,11 +492,12 @@ self.__bx_behaviors.selectMainBehavior();
    data.title = await page.title();
    data.favicon = await this.getFavicon(page, logDetails);

+    const archiveDir = path.join(this.collDir, "archive");
+
    if (this.params.screenshot) {
      if (!data.isHTMLPage) {
        logger.debug("Skipping screenshots for non-HTML page", logDetails);
      }
-      const archiveDir = path.join(this.collDir, "archive");
      const screenshots = new Screenshots({browser: this.browser, page, url, directory: archiveDir});
      if (this.params.screenshot.includes("view")) {
        await screenshots.take();
@ -509,9 +510,15 @@ self.__bx_behaviors.selectMainBehavior();
      }
    }

-    if (this.params.text && data.isHTMLPage) {
-      const result = await cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
-      data.text = await new TextExtract(result).parseTextFromDom();
+    let textextract = null;
+
+    if (data.isHTMLPage) {
+      textextract = new TextExtractViaSnapshot(cdp, {url, directory: archiveDir});
+      const {changed, text} = await textextract.extractAndStoreText("text", false, this.params.text.includes("to-warc"));
+
+      if (changed && text && this.params.text.includes("to-pages")) {
+        data.text = text;
+      }
    }

    data.loadState = LoadState.EXTRACTION_DONE;
@ -535,6 +542,10 @@ self.__bx_behaviors.selectMainBehavior();
        if (res) {
          data.loadState = LoadState.BEHAVIORS_DONE;
        }
+
+        if (textextract && this.params.text.includes("final-to-warc")) {
+          await textextract.extractAndStoreText("textFinal", true, true);
+        }
      }
    }

@ -1420,12 +1431,11 @@ self.__bx_behaviors.selectMainBehavior();

      if (createNew) {
        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
-        if (this.params.text) {
-          header["hasText"] = true;
-          logger.debug("Text Extraction: Enabled");
+        header["hasText"] = this.params.text.includes("to-pages");
+        if (this.params.text.length) {
+          logger.debug("Text Extraction: " + this.params.text.join(","));
        } else {
-          header["hasText"] = false;
-          logger.debug("Text Extraction: Disabled");
+          logger.debug("Text Extraction: None");
        }
        const header_formatted = JSON.stringify(header).concat("\n");
        await this.pagesFH.writeFile(header_formatted);
--- a/tests/text-extract.test.js
+++ b/tests/text-extract.test.js
@ -0,0 +1,18 @@
+import fs from "fs";
+import child_process from "child_process";
+
+// Runs a single-page crawl in docker with "--text to-warc,final-to-warc" and
+// then checks the generated CDXJ index for both text resource records.
+test("check that urn:text and urn:textFinal records are written to WARC", async () => {
+  try {
+    child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc");
+  } catch (error) {
+    // a failed crawl is reported through the index assertions below;
+    // stderr is printed only to help debugging
+    console.log(error.stderr);
+  }
+
+  const data = fs.readFileSync("test-crawls/collections/text-extract/indexes/index.cdxj", {"encoding": "utf-8"});
+
+  expect(data.includes("urn:text:https://www.nytimes.com/")).toBe(true);
+
+  expect(data.includes("urn:textFinal:https://www.nytimes.com/")).toBe(true);
+});
+
--- a/util/argParser.js
+++ b/util/argParser.js
@ -7,7 +7,7 @@ import { KnownDevices as devices } from "puppeteer-core";
 import yargs from "yargs";
 import { hideBin } from "yargs/helpers";

-import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } from "./constants.js";
+import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";
 import { screenshotTypes } from "./screenshots.js";
@ -45,12 +45,6 @@ class ArgParser {
        default: process.env.CRAWL_ID || os.hostname(),
      },

-      "newContext": {
-        describe: "Deprecated as of 0.8.0, any values passed will be ignored",
-        default: null,
-        type: "string"
-      },
-
      "waitUntil": {
        describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
        default: "load,networkidle2",
@ -197,8 +191,7 @@ class ArgParser {

      "text": {
        describe: "If set, extract text to the pages.jsonl file",
-        type: "boolean",
-        default: false,
+        type: "string",
      },

      "cwd": {
@ -458,28 +451,20 @@ class ArgParser {
    // waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
    // can be multiple separate by comma
    // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
-    if (typeof argv.waitUntil != "object"){
+    if (typeof argv.waitUntil != "object") {
      argv.waitUntil = argv.waitUntil.split(",");
    }

-    for (const opt of argv.waitUntil) {
-      if (!WAIT_UNTIL_OPTS.includes(opt)) {
-        logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
-      }
+    // split text options
+    if (argv.text === "" || argv.text === "true") {
+      argv.text = "to-pages";
    }

-    // validate screenshot options
-    if (argv.screenshot) {
-      const passedScreenshotTypes = argv.screenshot.split(",");
-      argv.screenshot = [];
-      passedScreenshotTypes.forEach((element) => {
-        if (element in screenshotTypes) {
-          argv.screenshot.push(element);
-        } else {
-          logger.warn(`${element} not found in ${screenshotTypes}`);
-        }
-      });
-    }
+    argv.waitUntil = validateArrayOpts(argv.waitUntil, "waitUntil", WAIT_UNTIL_OPTS);
+
+    argv.screenshot = validateArrayOpts(argv.screenshot, "screenshot", Array.from(Object.keys(screenshotTypes)));
+
+    argv.text = validateArrayOpts(argv.text, "text", EXTRACT_TEXT_TYPES);

    // log options
    argv.logging = argv.logging.split(",");
@ -575,6 +560,30 @@ class ArgParser {
  }
 }

+/**
+ * Normalize a multi-value option into an array and validate every entry.
+ *
+ * Accepts a comma-separated string or an array. Falsy values, and values that
+ * are neither a string nor an array, produce an empty array. Every entry that
+ * is not listed in allowedValues is reported through logger.fatal().
+ *
+ * @param {*} value - raw option value (string, array, or unset)
+ * @param {string} name - option name, used in the error message
+ * @param {string[]} allowedValues - entries accepted for this option
+ * @returns {string[]} the entries of the option
+ */
+function validateArrayOpts(value, name, allowedValues) {
+  if (!value) {
+    return [];
+  }
+
+  let arrayValue;
+
+  if (Array.isArray(value)) {
+    // arrays must be validated as well: e.g. waitUntil is already split into
+    // an array before it is passed to this function
+    arrayValue = value;
+  } else if (typeof value === "string") {
+    arrayValue = value.split(",");
+  } else {
+    return [];
+  }
+
+  for (const entry of arrayValue) {
+    if (!allowedValues.includes(entry)) {
+      logger.fatal(`Invalid value "${entry}" for field "${name}": allowed values are: ${allowedValues.join(",")}`);
+    }
+  }
+
+  return arrayValue;
+}
+
 export function parseArgs(argv) {
  return new ArgParser().parseArgs(argv);
 }
--- a/util/browser.js
+++ b/util/browser.js
@ -361,4 +361,5 @@ export const defaultArgs = [
  "--no-service-autorun",
  "--export-tagged-pdf",
  "--component-updater=url-source=http://invalid.dev/",
+  "--brave-stats-updater-server=url-source=http://invalid.dev/"
 ];
--- a/util/constants.js
+++ b/util/constants.js
@ -1,6 +1,8 @@

 export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
 export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
+export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
+
 export const BEHAVIOR_LOG_FUNC = "__bx_log";
 export const ADD_LINK_FUNC = "__bx_addLink";
 export const MAX_DEPTH = 1000000;
--- a/util/screenshots.js
+++ b/util/screenshots.js
@ -1,10 +1,9 @@
-import fs from "fs";
-import path from "path";
-import * as warcio from "warcio";
 import sharp from "sharp";

+import { WARCResourceWriter } from "./warcresourcewriter.js";
 import { logger, errJSON } from "./logger.js";

+
 // ============================================================================

 export const screenshotTypes = {
@ -26,15 +25,12 @@ export const screenshotTypes = {
 };


-export class Screenshots {
+export class Screenshots extends WARCResourceWriter {

-  constructor({browser, page, url, date, directory}) {
-    this.browser = browser;
-    this.page = page;
-    this.url = url;
-    this.directory = directory;
-    this.warcName = path.join(this.directory, "screenshots.warc.gz");
-    this.date = date ? date : new Date();
+  constructor(opts) {
+    super({...opts, warcName: "screenshots.warc.gz"});
+    this.browser = opts.browser;
+    this.page = opts.page;
  }

  async take(screenshotType="view") {
@ -44,7 +40,7 @@ export class Screenshots {
      }
      const options = screenshotTypes[screenshotType];
      const screenshotBuffer = await this.page.screenshot(options);
-      await this.writeBufferToWARC(screenshotBuffer, screenshotType, options.type);
+      await this.writeBufferToWARC(screenshotBuffer, screenshotType, "image/" + options.type);
      logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
    } catch (e) {
      logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
@ -65,32 +61,10 @@ export class Screenshots {
        // 16:9 thumbnail
        .resize(640, 360)
        .toBuffer();
-      await this.writeBufferToWARC(thumbnailBuffer, screenshotType, options.type);
+      await this.writeBufferToWARC(thumbnailBuffer, screenshotType, "image/" + options.type);
      logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
    } catch (e) {
      logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
    }
  }
-
-  async writeBufferToWARC(screenshotBuffer, screenshotType, imageType) {
-    const warcRecord = await this.wrap(screenshotBuffer, screenshotType, imageType);
-    const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
-    fs.appendFileSync(this.warcName, warcRecordBuffer);
-  }
-
-  async wrap(buffer, screenshotType="screenshot", imageType="png") {
-    const warcVersion = "WARC/1.1";
-    const warcRecordType = "resource";
-    const warcHeaders = {"Content-Type": `image/${imageType}`};
-    async function* content() {
-      yield buffer;
-    }
-    let screenshotUrl = `urn:${screenshotType}:` + this.url;
-    return warcio.WARCRecord.create({
-      url: screenshotUrl,
-      date: this.date.toISOString(),
-      type: warcRecordType,
-      warcVersion,
-      warcHeaders}, content());
-  }
 }
--- a/util/textextract.js
+++ b/util/textextract.js
@ -1,7 +1,106 @@
-export class TextExtract {
+import { WARCResourceWriter } from "./warcresourcewriter.js";
+import { logger } from "./logger.js";

-  constructor(dom){
-    this.dom = dom;
+
+// ============================================================================
+/**
+ * Shared logic for extracting page text over CDP and optionally storing it as
+ * a WARC resource record in "text.warc.gz". Subclasses implement doGetText().
+ */
+export class BaseTextExtract extends WARCResourceWriter {
+  /**
+   * @param cdp - CDP session, kept for subclasses to query the page
+   * @param opts - options passed through to WARCResourceWriter
+   */
+  constructor(cdp, opts) {
+    super({...opts, warcName: "text.warc.gz"});
+    this.cdp = cdp;
+    // text of the previous extraction, used to detect unchanged text
+    this.lastText = null;
+  }
+
+  /**
+   * Extract the page text and optionally append it to the WARC file.
+   *
+   * @param {string} resourceType - resource type passed to writeBufferToWARC()
+   * @param {boolean} ignoreIfMatchesLast - when true, skip storing if the text
+   *   equals the previously extracted text
+   * @param {boolean} saveToWarc - when true, write the text as a "text/plain" record
+   * @returns {Promise<{changed: boolean, text: ?string}>} changed is false when
+   *   the text was skipped as unchanged or when extraction failed (text is
+   *   null on failure)
+   */
+  async extractAndStoreText(resourceType, ignoreIfMatchesLast = false, saveToWarc = false) {
+    try {
+      const text = await this.doGetText();
+
+      if (ignoreIfMatchesLast && text === this.lastText) {
+        // lastText already equals text here, so it must not be overwritten
+        // (there is no this.text property to copy from)
+        logger.debug("Skipping, extracted text unchanged from last extraction", {url: this.url}, "text");
+        return {changed: false, text};
+      }
+      if (saveToWarc) {
+        await this.writeBufferToWARC(new TextEncoder().encode(text), resourceType, "text/plain");
+        logger.debug(`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`);
+      }
+
+      this.lastText = text;
+      return {changed: true, text};
+    } catch (e) {
+      // best-effort: extraction errors are logged and reported as "no text"
+      logger.debug("Error extracting text", e, "text");
+      return {changed: false, text: null};
+    }
+  }
+
+  /**
+   * Return the current page text. Must be overridden by subclasses.
+   */
+  async doGetText() {
+    throw new Error("unimplemented");
+  }
+}
+
+
+// ============================================================================
+/**
+ * Text extraction based on the CDP "DOMSnapshot.captureSnapshot" command.
+ */
+export class TextExtractViaSnapshot extends BaseTextExtract {
+  /**
+   * Capture a DOM snapshot (without computed styles) and return its text.
+   */
+  async doGetText() {
+    const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []});
+    return this.parseTextFromDOMSnapshot(result);
+  }
+
+  /**
+   * Collect the trimmed, non-empty text nodes of every document in the
+   * snapshot, joined by newlines.
+   *
+   * A text node is skipped when its direct parent element is one of
+   * SKIPPED_NODES; deeper ancestors are not checked.
+   *
+   * @param result - snapshot response providing "strings" and "documents"
+   * @returns {string} extracted text, "" if no text was found
+   */
+  parseTextFromDOMSnapshot(result) {
+    const TEXT_NODE = 3;
+    const ELEMENT_NODE = 1;
+
+    const SKIPPED_NODES = ["SCRIPT", "STYLE", "HEADER", "FOOTER", "BANNER-DIV", "NOSCRIPT", "TITLE"];
+
+    const {strings, documents} = result;
+
+    const accum = [];
+
+    for (const doc of documents) {
+      // nodeValue / nodeName hold indexes into the shared "strings" table
+      const nodeValues = doc.nodes.nodeValue || [];
+      const nodeNames = doc.nodes.nodeName || [];
+      const nodeTypes = doc.nodes.nodeType || [];
+      const parentIndex = doc.nodes.parentIndex || [];
+
+      for (let i = 0; i < nodeValues.length; i++) {
+        // -1 means the node has no value
+        if (nodeValues[i] === -1) {
+          continue;
+        }
+
+        if (nodeTypes[i] === TEXT_NODE) {
+          const pi = parentIndex[i];
+          if (pi >= 0 && nodeTypes[pi] === ELEMENT_NODE) {
+            const name = strings[nodeNames[pi]];
+
+            if (!SKIPPED_NODES.includes(name)) {
+              const value = strings[nodeValues[i]].trim();
+              if (value) {
+                accum.push(value);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // join only after all documents were visited, so that text from every
+    // document in the snapshot is included, not just the first one
+    return accum.join("\n");
+  }
+}
+
+
+// ============================================================================
+export class TextExtractViaDocument extends BaseTextExtract {
+  /**
+   * Fetch the whole DOM tree with "DOM.getDocument" (depth -1, pierce enabled)
+   * and return its extracted text.
+   */
+  async doGetText() {
+    const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
+    // the parser method of this class is spelled parseTextFromDom
+    return this.parseTextFromDom(result);
+  }
+
+  /**
+   * Walk the DOM starting at dom.root and return the collected text lines
+   * joined by newlines.
+   *
+   * @param dom - response object exposing the root node as "root"
+   * @returns {Promise<string>} extracted text
+   */
+  async parseTextFromDom(dom) {
+    const accum = [];
+    const metadata = {};
+
+    // parseText is declared async: await it so accum is complete before
+    // joining and so its errors reach the caller instead of being dropped
+    await this.parseText(dom.root, metadata, accum);
+
+    return accum.join("\n");
   }

  async parseText(node, metadata, accum) {
@ -45,14 +144,5 @@ export class TextExtract {
      } 
    }
  }
-
-  async parseTextFromDom() {
-    const accum = [];
-    const metadata = {};
-
-    this.parseText(this.dom.root, metadata, accum);
-
-    return accum.join("\n");
-  }
 }

--- a/util/warcresourcewriter.js
+++ b/util/warcresourcewriter.js
@ -0,0 +1,37 @@
+import fs from "fs";
+import path from "path";
+import * as warcio from "warcio";
+
+/**
+ * Appends gzip-compressed WARC "resource" records for a single page URL to a
+ * WARC file inside the given directory. Records are addressed as
+ * "urn:<resourceType>:<url>".
+ */
+export class WARCResourceWriter
+{
+  /**
+   * @param {object} opts
+   * @param opts.url - page URL the resources belong to
+   * @param opts.directory - directory containing the WARC file
+   * @param opts.date - optional record date, defaults to the current time
+   * @param opts.warcName - file name of the WARC inside the directory
+   */
+  constructor({url, directory, date, warcName}) {
+    this.url = url;
+    this.directory = directory;
+    this.warcName = path.join(this.directory, warcName);
+    this.date = date || new Date();
+  }
+
+  /**
+   * Wrap the contents in a WARC record, serialize it with gzip and append it
+   * to the WARC file.
+   */
+  async writeBufferToWARC(contents, resourceType, contentType) {
+    const record = await this.wrap(contents, resourceType, contentType);
+    const serialized = await warcio.WARCSerializer.serialize(record, {gzip: true});
+    fs.appendFileSync(this.warcName, serialized);
+  }
+
+  /**
+   * Create a WARC/1.1 "resource" record whose payload is the given buffer.
+   */
+  async wrap(buffer, resourceType, contentType) {
+    // the record payload is supplied as an async iterator yielding one chunk
+    const payload = (async function* () {
+      yield buffer;
+    })();
+
+    return warcio.WARCRecord.create({
+      url: `urn:${resourceType}:${this.url}`,
+      date: this.date.toISOString(),
+      type: "resource",
+      warcVersion: "WARC/1.1",
+      warcHeaders: {"Content-Type": contentType}
+    }, payload);
+  }
+}