add text extraction (#28)

* add text extraction via --text flag

* update readme with --text and --generateWACZ flags

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
Emma Dickson 2021-02-23 16:52:54 -05:00 committed by GitHub
parent 0688674f6f
commit 748b0399e9
3 changed files with 98 additions and 8 deletions

README.md

@@ -65,6 +65,10 @@ Options:
                    [string] [default: "/app/defaultDriver.js"]
  --generateCDX     If set, generate index (CDXJ) for use with pywb after crawl
                    is done                           [boolean] [default: false]
+  --generateWACZ   If set, generate wacz for use with pywb after crawl
+                   is done                           [boolean] [default: false]
+  --text           If set, extract the pages full text to be added to the pages.jsonl
+                   file                              [boolean] [default: false]
  --cwd             Crawl working directory for captures (pywb root). If not
                    set, defaults to process.cwd      [string] [default: "/crawls"]
 ```
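With these flags in place, a typical crawl that also extracts text and writes a WACZ could be invoked as follows (a sketch assuming the webrecorder/browsertrix-crawler Docker image and crawl entrypoint used elsewhere in the README; the URL and volume mount are placeholders):

    docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --url https://example.com/ --text --generateWACZ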

behaviors/global/textextract.js

@@ -0,0 +1,59 @@
class TextExtract {

  constructor(dom){
    // dom is the result of a CDP "DOM.getDocument" call (see crawler.js below)
    this.dom = dom;
  }

  async parseText(node, metadata, accum) {
    const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
    const EMPTY_LIST = [];
    const TEXT = "#text";
    const TITLE = "title";

    const name = node.nodeName.toLowerCase();

    // skip non-content elements entirely
    if (SKIPPED_NODES.includes(name)) {
      return;
    }

    const children = node.children || EMPTY_LIST;

    if (name === TEXT) {
      // text node: accumulate its trimmed value
      const value = node.nodeValue ? node.nodeValue.trim() : '';
      if (value) {
        accum.push(value);
      }
    } else if (name === TITLE) {
      // the <title> goes into metadata when provided, otherwise into the text accumulator
      const title = [];
      for (let child of children) {
        this.parseText(child, null, title);
      }
      if (metadata) {
        metadata.title = title.join(' ');
      } else {
        accum.push(title.join(' '));
      }
    } else {
      // recurse into child nodes, and into nested documents (e.g. iframes)
      for (let child of children) {
        this.parseText(child, metadata, accum);
      }
      if (node.contentDocument) {
        this.parseText(node.contentDocument, null, accum);
      }
    }
  }

  async parseTextFromDom() {
    const accum = [];
    const metadata = {};

    this.parseText(this.dom.root, metadata, accum);
    return accum.join('\n');
  }
}

module.exports = TextExtract;
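For context, a TextExtract instance is constructed from the result of a CDP DOM.getDocument call and then walked with parseTextFromDom, as the crawler.js change below does. A minimal standalone sketch, assuming page is an already-navigated Puppeteer Page (the extractPageText helper name is illustrative):

const TextExtract = require("./behaviors/global/textextract");

async function extractPageText(page) {
  // open a raw CDP session and fetch the full, pierced DOM tree
  const client = await page.target().createCDPSession();
  const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});

  // walk the returned tree and join all visible text nodes
  return await new TextExtract(result).parseTextFromDom();
}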

crawler.js

@@ -7,7 +7,7 @@ const path = require("path");
const fs = require("fs");
const Sitemapper = require("sitemapper");
const { v4: uuidv4 } = require("uuid");
+const TextExtract = require("./behaviors/global/textextract");
const BackgroundBehaviors = require("./behaviors/bgbehaviors");
@@ -222,6 +222,12 @@ class Crawler {
        default: false,
      },
+      "text": {
+        describe: "If set, extract text to the pages.jsonl file",
+        type: "boolean",
+        default: false,
+      },
      "cwd": {
        describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
        type: "string",
@@ -396,7 +402,7 @@ class Crawler {
      process.exit(1);
    }
  }
  async crawlPage({page, data}) {
    try {
      if (this.emulateDevice) {
@@ -407,9 +413,17 @@ class Crawler {
      // run custom driver here
      await this.driver({page, data, crawler: this});
      const title = await page.title();
-      this.writePage(data.url, title);
+      var text = ''
+      if (this.params.text){
+        const client = await page.target().createCDPSession();
+        const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
+        var text = await new TextExtract(result).parseTextFromDom()
+      }
+      this.writePage(data.url, title, this.params.text, text);
      if (bgbehavior) {
        await bgbehavior();
@@ -486,7 +500,6 @@ class Crawler {
    }
  }
  writeStats() {
    if (this.params.statsFilename) {
      const total = this.cluster.allTargetCount;
@@ -549,17 +562,31 @@
      // create pages dir if doesn't exist and write pages.jsonl header
      if (!fs.existsSync(this.pagesDir)) {
        fs.mkdirSync(this.pagesDir);
-        const header = JSON.stringify({"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": false}).concat("\n");
-        fs.writeFileSync(this.pagesFile, header);
+        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
+        if (this.params.text) {
+          console.log("creating pages with full text");
+          header["hasText"] = true
+        }
+        else{
+          console.log("creating pages without full text");
+          header["hasText"] = false
+        }
+        const header_formatted = JSON.stringify(header).concat("\n")
+        fs.writeFileSync(this.pagesFile, header_formatted);
      }
    } catch(err) {
      console.log("pages/pages.jsonl creation failed", err);
    }
  }
-  writePage(url, title){
+  writePage(url, title, text, text_content){
    const id = uuidv4();
    const row = {"id": id, "url": url, "title": title};
+    if (text == true){
+      row['text'] = text_content
+    }
    const processedRow = JSON.stringify(row).concat("\n");
    try {
      fs.appendFileSync(this.pagesFile, processedRow);
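Taken together, these changes mean that when --text is set, pages.jsonl starts with a header advertising "hasText": true and each page row carries the extracted text. A hypothetical two-line file (the id, url, title, and text values below are made up for illustration):

{"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": true}
{"id": "b1946ac9-2f6e-4df5-9d0b-6f5f3cf4a1c2", "url": "https://example.com/", "title": "Example Domain", "text": "Example Domain\nThis domain is for use in illustrative examples in documents."}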