Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 06:23:16 +00:00
add text extraction (#28)
* add text extraction via --text flag
* update readme with --text and --generateWACZ flags

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
parent 0688674f6f
commit 748b0399e9
3 changed files with 98 additions and 8 deletions
README.md
@@ -65,6 +65,10 @@ Options:
                   [string] [default: "/app/defaultDriver.js"]
   --generateCDX   If set, generate index (CDXJ) for use with pywb after crawl
                   is done                        [boolean] [default: false]
+  --generateWACZ  If set, generate wacz for use with pywb after crawl
+                  is done                        [boolean] [default: false]
+  --text          If set, extract the pages full text to be added to the pages.jsonl
+                  file                           [boolean] [default: false]
   --cwd           Crawl working directory for captures (pywb root). If not
                   set, defaults to process.cwd   [string] [default: "/crawls"]
 ```

behaviors/global/textextract.js (new file, 59 lines)
@@ -0,0 +1,59 @@
+class TextExtract {
+
+  constructor(dom){
+    this.dom = dom;
+  }
+
+  async parseText(node, metadata, accum) {
+    const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
+    const EMPTY_LIST = [];
+    const TEXT = "#text";
+    const TITLE = "title";
+
+    const name = node.nodeName.toLowerCase();
+
+    if (SKIPPED_NODES.includes(name)) {
+      return;
+    }
+
+    const children = node.children || EMPTY_LIST;
+
+    if (name === TEXT) {
+      const value = node.nodeValue ? node.nodeValue.trim() : '';
+      if (value) {
+        accum.push(value);
+      }
+    } else if (name === TITLE) {
+      const title = [];
+
+      for (let child of children) {
+        this.parseText(child, null, title);
+      }
+
+      if (metadata) {
+        metadata.title = title.join(' ');
+      } else {
+        accum.push(title.join(' '));
+      }
+    } else {
+      for (let child of children) {
+        this.parseText(child, metadata, accum);
+      }
+
+      if (node.contentDocument) {
+        this.parseText(node.contentDocument, null, accum);
+      }
+    }
+  }
+
+  async parseTextFromDom() {
+    const accum = [];
+    const metadata = {};
+
+    this.parseText(this.dom.root, metadata, accum);
+
+    return accum.join('\n');
+  }
+}
+
+module.exports = TextExtract;
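
The class walks the node tree returned by the Chrome DevTools Protocol call DOM.getDocument (plain objects carrying nodeName, nodeValue, children, and contentDocument), so it can be exercised without a browser. A minimal sketch, not part of the commit, using a hand-built snapshot whose node names and text values are placeholders:

```js
// Sketch only: drive TextExtract with a hand-built snapshot shaped like the
// {root: {...}} result of CDP DOM.getDocument. All values are placeholders.
const TextExtract = require("./behaviors/global/textextract");

const fakeSnapshot = {
  root: {
    nodeName: "#document",
    children: [{
      nodeName: "HTML",
      children: [
        // "head" is in SKIPPED_NODES, so nothing under it is visited
        { nodeName: "HEAD", children: [] },
        {
          nodeName: "BODY",
          children: [
            { nodeName: "P", children: [{ nodeName: "#text", nodeValue: "  Hello, world.  " }] },
            { nodeName: "SCRIPT", children: [{ nodeName: "#text", nodeValue: "ignored();" }] },
          ],
        },
      ],
    }],
  },
};

new TextExtract(fakeSnapshot).parseTextFromDom().then((text) => {
  console.log(text); // "Hello, world." -- trimmed, with head/script content skipped
});
```

Note that when a metadata object is passed down (as parseTextFromDom does), any title text is routed into metadata.title rather than into the returned string; only the accumulated #text values are joined with newlines and returned.
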

crawler.js (43 changed lines)
@@ -7,7 +7,7 @@ const path = require("path");
 const fs = require("fs");
 const Sitemapper = require("sitemapper");
 const { v4: uuidv4 } = require("uuid");

+const TextExtract = require("./behaviors/global/textextract");
 const BackgroundBehaviors = require("./behaviors/bgbehaviors");
@@ -222,6 +222,12 @@ class Crawler {
         default: false,
       },

+      "text": {
+        describe: "If set, extract text to the pages.jsonl file",
+        type: "boolean",
+        default: false,
+      },
+
       "cwd": {
         describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
         type: "string",
@@ -396,7 +402,7 @@ class Crawler {
       process.exit(1);
     }
   }

   async crawlPage({page, data}) {
     try {
       if (this.emulateDevice) {
@@ -407,9 +413,17 @@
       // run custom driver here
       await this.driver({page, data, crawler: this});

       const title = await page.title();
-      this.writePage(data.url, title);
+      var text = ''
+      if (this.params.text){
+        const client = await page.target().createCDPSession();
+        const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
+        var text = await new TextExtract(result).parseTextFromDom()
+      }
+
+      this.writePage(data.url, title, this.params.text, text);

       if (bgbehavior) {
         await bgbehavior();
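
The hunk above is the heart of the feature: when --text is set, crawlPage() opens a CDP session, requests a full, iframe-piercing DOM snapshot, and runs it through TextExtract before recording the page. For context, a standalone sketch of the same sequence outside the crawler, assuming the puppeteer package is installed (crawler.js already drives pages through puppeteer's API) and using a placeholder URL:

```js
// Standalone sketch (not part of the commit) of the --text extraction path.
const puppeteer = require("puppeteer");
const TextExtract = require("./behaviors/global/textextract");

async function extractPageText(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(url, { waitUntil: "load" });

  // Same CDP call as crawlPage(): depth -1 walks the whole tree,
  // pierce: true descends into iframes (their nodes appear under contentDocument).
  const client = await page.target().createCDPSession();
  const result = await client.send("DOM.getDocument", { depth: -1, pierce: true });

  const text = await new TextExtract(result).parseTextFromDom();

  await browser.close();
  return text;
}

extractPageText("https://example.com/").then((text) => console.log(text));
```
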
@@ -486,7 +500,6 @@
     }
   }

   writeStats() {
     if (this.params.statsFilename) {
       const total = this.cluster.allTargetCount;
@@ -549,17 +562,31 @@
       // create pages dir if doesn't exist and write pages.jsonl header
       if (!fs.existsSync(this.pagesDir)) {
         fs.mkdirSync(this.pagesDir);
-        const header = JSON.stringify({"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": false}).concat("\n");
-        fs.writeFileSync(this.pagesFile, header);
+        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
+        if (this.params.text) {
+          console.log("creating pages with full text");
+          header["hasText"] = true
+        }
+        else{
+          console.log("creating pages without full text");
+          header["hasText"] = false
+        }
+        const header_formatted = JSON.stringify(header).concat("\n")
+        fs.writeFileSync(this.pagesFile, header_formatted);
       }
     } catch(err) {
       console.log("pages/pages.jsonl creation failed", err);
     }
   }

-  writePage(url, title){
+  writePage(url, title, text, text_content){
     const id = uuidv4();
     const row = {"id": id, "url": url, "title": title};

+    if (text == true){
+      row['text'] = text_content
+    }
+
     const processedRow = JSON.stringify(row).concat("\n");
     try {
       fs.appendFileSync(this.pagesFile, processedRow);
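
Taken together, initPages() and writePage() now produce a pages.jsonl whose header advertises hasText and whose rows carry the extracted text when --text is set. An illustrative file, with placeholder id, url, title, and text values:

```
{"format":"json-pages-1.0","id":"pages","title":"All Pages","hasText":true}
{"id":"<uuid>","url":"https://example.com/","title":"Example Domain","text":"Example Domain\nThis domain is for use in illustrative examples..."}
```
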