// browsertrix-crawler/util/textextract.js

// Lower-cased node names whose whole subtree is excluded from extraction.
const SKIPPED_NODES = new Set(["head", "script", "style", "header", "footer", "banner-div", "noscript"]);
const TEXT_NODE = "#text";
const TITLE_NODE = "title";

/**
 * Synchronously walks `node` depth-first and collects its visible text.
 *
 * Kept synchronous on purpose: an exception raised for any descendant
 * propagates through the normal call stack to the caller instead of being
 * lost in a promise nobody awaits.
 *
 * @param {object} node - Node with `nodeName` and optionally `nodeValue`,
 *   `children` and `contentDocument`.
 * @param {?object} metadata - When truthy, receives `title`; when falsy, title
 *   text is appended to `accum` instead.
 * @param {string[]} accum - Output array; trimmed, non-empty text is pushed here.
 * @throws {TypeError} If a visited node has no string `nodeName`.
 */
function collectText(node, metadata, accum) {
  const name = node.nodeName.toLowerCase();

  if (SKIPPED_NODES.has(name)) {
    return;
  }

  if (name === TEXT_NODE) {
    // Text nodes contribute their trimmed value; whitespace-only text is dropped.
    const value = node.nodeValue ? node.nodeValue.trim() : "";
    if (value) {
      accum.push(value);
    }
    return;
  }

  const children = node.children || [];

  if (name === TITLE_NODE) {
    // Gather the title's text separately so it can be routed to metadata.
    const titleParts = [];
    for (const child of children) {
      collectText(child, null, titleParts);
    }

    const title = titleParts.join(" ");
    if (metadata) {
      metadata.title = title;
    } else {
      accum.push(title);
    }
    return;
  }

  for (const child of children) {
    collectText(child, metadata, accum);
  }

  // A nested document is walked without a metadata object, so any title found
  // inside it is appended to the text rather than overwriting metadata.title.
  if (node.contentDocument) {
    collectText(node.contentDocument, null, accum);
  }
}

/**
 * Extracts text from a DOM snapshot object.
 *
 * The snapshot is expected to expose the tree under `dom.root`
 * (presumably the result of a DevTools `DOM.getDocument` call — confirm
 * against the caller).
 */
class TextExtract {

  /**
   * @param {object} dom - Snapshot whose `root` property is the top node.
   */
  constructor(dom){
    this.dom = dom;
  }

  /**
   * Collects the text below `node` into `accum`.
   *
   * The traversal itself runs synchronously, so `accum` (and `metadata.title`)
   * are populated by the time this call returns its promise; the promise
   * rejects if any node in the subtree cannot be processed.
   *
   * @param {object} node - Node to start from.
   * @param {?object} metadata - Receives `title` when truthy.
   * @param {string[]} accum - Array the extracted text pieces are pushed onto.
   * @returns {Promise<void>}
   */
  async parseText(node, metadata, accum) {
    collectText(node, metadata, accum);
  }

  /**
   * Extracts all text from `this.dom.root`.
   *
   * Title metadata gathered during the walk is not part of the return value.
   *
   * @returns {Promise<string>} Extracted text pieces joined with newlines.
   */
  async parseTextFromDom() {
    const accum = [];
    const metadata = {};

    // Awaited so a traversal failure rejects this call instead of surfacing
    // as an unhandled rejection alongside a partial result.
    await this.parseText(this.dom.root, metadata, accum);

    return accum.join("\n");
  }
}

// CommonJS export: the TextExtract class is this module's sole export.
module.exports = TextExtract;