Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
add text extraction (#28)

* add text extraction via --text flag
* update readme with --text and --generateWACZ flags

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>

parent 0688674f6f
commit 748b0399e9

3 changed files with 98 additions and 8 deletions
4  README.md

@@ -65,6 +65,10 @@ Options:
                            [string] [default: "/app/defaultDriver.js"]
   --generateCDX    If set, generate index (CDXJ) for use with pywb after crawl
                    is done                          [boolean] [default: false]
+  --generateWACZ   If set, generate wacz for use with pywb after crawl
+                   is done                          [boolean] [default: false]
+  --text           If set, extract the pages full text to be added to the pages.jsonl
+                   file                             [boolean] [default: false]
   --cwd            Crawl working directory for captures (pywb root). If not
                    set, defaults to process.cwd     [string] [default: "/crawls"]
 ```
59  behaviors/global/textextract.js  (new file)

@@ -0,0 +1,59 @@
class TextExtract {

  constructor(dom){
    this.dom = dom;
  }

  async parseText(node, metadata, accum) {
    const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
    const EMPTY_LIST = [];
    const TEXT = "#text";
    const TITLE = "title";

    const name = node.nodeName.toLowerCase();

    if (SKIPPED_NODES.includes(name)) {
      return;
    }

    const children = node.children || EMPTY_LIST;

    if (name === TEXT) {
      const value = node.nodeValue ? node.nodeValue.trim() : '';
      if (value) {
        accum.push(value);
      }
    } else if (name === TITLE) {
      const title = [];

      for (let child of children) {
        this.parseText(child, null, title);
      }

      if (metadata) {
        metadata.title = title.join(' ');
      } else {
        accum.push(title.join(' '));
      }
    } else {
      for (let child of children) {
        this.parseText(child, metadata, accum);
      }

      if (node.contentDocument) {
        this.parseText(node.contentDocument, null, accum);
      }
    }
  }

  async parseTextFromDom() {
    const accum = [];
    const metadata = {};

    this.parseText(this.dom.root, metadata, accum);

    return accum.join('\n');
  }
}

module.exports = TextExtract;
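TextExtract walks a CDP-style DOM tree rather than live DOM nodes: each node is expected to expose nodeName, nodeValue, children, and (for frames) contentDocument, matching the shape returned by the DevTools DOM.getDocument command used later in this commit. A minimal sketch of exercising the class against a hand-built tree; the tree, the assertion, and the script location (repository root) are illustrative assumptions, not part of the commit:

```js
// Illustrative sketch only: feeds TextExtract a hand-built, CDP-shaped
// node tree to show the fields parseText() relies on.
const assert = require("assert");
const TextExtract = require("./behaviors/global/textextract");

const fakeDom = {
  root: {
    nodeName: "#document",
    children: [
      { nodeName: "HEAD", children: [
        { nodeName: "TITLE", children: [ { nodeName: "#text", nodeValue: "Example Page" } ] },
        { nodeName: "SCRIPT", children: [ { nodeName: "#text", nodeValue: "ignored()" } ] },
      ]},
      { nodeName: "BODY", children: [
        { nodeName: "P", children: [ { nodeName: "#text", nodeValue: "  Hello world  " } ] },
      ]},
    ],
  },
};

new TextExtract(fakeDom).parseTextFromDom().then((text) => {
  // <head> is in SKIPPED_NODES, so only the trimmed body text survives.
  assert.strictEqual(text, "Hello world");
  console.log(JSON.stringify(text));
});
```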
43  crawler.js
@@ -7,7 +7,7 @@ const path = require("path");
 const fs = require("fs");
 const Sitemapper = require("sitemapper");
 const { v4: uuidv4 } = require("uuid");
+const TextExtract = require("./behaviors/global/textextract");
 const BackgroundBehaviors = require("./behaviors/bgbehaviors");
 
 
@@ -222,6 +222,12 @@ class Crawler {
         default: false,
       },
 
+      "text": {
+        describe: "If set, extract text to the pages.jsonl file",
+        type: "boolean",
+        default: false,
+      },
+
       "cwd": {
         describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
         type: "string",
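The new option block follows the same yargs shape as the existing flags (describe/type/default), so the value surfaces on the parsed params object as a plain boolean. A minimal standalone sketch of that pattern, assuming the classic yargs CommonJS API; this is not the crawler's actual argument wiring:

```js
// Minimal sketch of the yargs option style used above; standalone, not crawler code.
const yargs = require("yargs");

const params = yargs
  .options({
    "text": {
      describe: "If set, extract text to the pages.jsonl file",
      type: "boolean",
      default: false,
    },
  })
  .argv;

// With no --text flag this prints false; `node sketch.js --text` prints true.
console.log("text extraction enabled:", params.text);
```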
@@ -396,7 +402,7 @@ class Crawler {
       process.exit(1);
     }
   }
 
   async crawlPage({page, data}) {
     try {
       if (this.emulateDevice) {
@@ -407,9 +413,17 @@
 
       // run custom driver here
       await this.driver({page, data, crawler: this});
 
 
       const title = await page.title();
-      this.writePage(data.url, title);
+      var text = ''
+      if (this.params.text){
+        const client = await page.target().createCDPSession();
+        const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
+        var text = await new TextExtract(result).parseTextFromDom()
+      }
+
+      this.writePage(data.url, title, this.params.text, text);
+
       if (bgbehavior) {
         await bgbehavior();
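The crawlPage() change above is the whole extraction path: when --text is set, a CDP session is opened on the page, DOM.getDocument is asked for the full pierced tree, and the response is handed to TextExtract. The same sequence can be reproduced outside the crawler with a few lines of Puppeteer; a rough standalone sketch in which the URL and require path are placeholders and error handling is omitted:

```js
// Standalone sketch (not crawler code): mirrors the --text path in crawlPage().
const puppeteer = require("puppeteer");
const TextExtract = require("./behaviors/global/textextract");

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto("https://example.com/", { waitUntil: "networkidle2" });

  // depth: -1 returns the whole DOM tree; pierce: true descends into iframes
  // and shadow roots, which is what makes node.contentDocument visible to TextExtract.
  const client = await page.target().createCDPSession();
  const result = await client.send("DOM.getDocument", { depth: -1, pierce: true });

  const text = await new TextExtract(result).parseTextFromDom();
  console.log(text);

  await browser.close();
})();
```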
@@ -486,7 +500,6 @@ class Crawler {
     }
   }
 
-
   writeStats() {
     if (this.params.statsFilename) {
       const total = this.cluster.allTargetCount;
@@ -549,17 +562,31 @@
       // create pages dir if doesn't exist and write pages.jsonl header
       if (!fs.existsSync(this.pagesDir)) {
         fs.mkdirSync(this.pagesDir);
-        const header = JSON.stringify({"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": false}).concat("\n");
-        fs.writeFileSync(this.pagesFile, header);
+        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
+        if (this.params.text) {
+          console.log("creating pages with full text");
+          header["hasText"] = true
+        }
+        else{
+          console.log("creating pages without full text");
+          header["hasText"] = false
+        }
+        const header_formatted = JSON.stringify(header).concat("\n")
+        fs.writeFileSync(this.pagesFile, header_formatted);
       }
     } catch(err) {
       console.log("pages/pages.jsonl creation failed", err);
     }
   }
 
-  writePage(url, title){
+  writePage(url, title, text, text_content){
     const id = uuidv4();
     const row = {"id": id, "url": url, "title": title};
+
+    if (text == true){
+      row['text'] = text_content
+    }
+
     const processedRow = JSON.stringify(row).concat("\n");
     try {
       fs.appendFileSync(this.pagesFile, processedRow);
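Taken together, pages.jsonl now opens with a header whose hasText flag mirrors --text, followed by one JSON line per page that gains a text field when extraction is enabled. An illustrative sketch of the shapes written by the header setup and writePage() above; the URL, title, and text values are placeholders:

```js
// Illustrative only: mirrors the pages.jsonl header and row shapes after this change.
const { v4: uuidv4 } = require("uuid");

const textEnabled = true;  // stands in for this.params.text

const header = { "format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": textEnabled };

const row = { "id": uuidv4(), "url": "https://example.com/", "title": "Example Domain" };
if (textEnabled) {
  row["text"] = "Example Domain\nThis domain is for use in illustrative examples...";
}

process.stdout.write(JSON.stringify(header).concat("\n"));
process.stdout.write(JSON.stringify(row).concat("\n"));
```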