Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 06:23:16 +00:00
add text extraction (#28)
* add text extraction via --text flag
* update readme with --text and --generateWACZ flags

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
parent 0688674f6f
commit 748b0399e9
3 changed files with 98 additions and 8 deletions
README.md
@@ -65,6 +65,10 @@ Options:
                   [string] [default: "/app/defaultDriver.js"]
   --generateCDX   If set, generate index (CDXJ) for use with pywb after crawl
                   is done                        [boolean] [default: false]
+  --generateWACZ  If set, generate wacz for use with pywb after crawl
+                  is done                        [boolean] [default: false]
+  --text          If set, extract the pages full text to be added to the pages.jsonl
+                  file                           [boolean] [default: false]
   --cwd           Crawl working directory for captures (pywb root). If not
                   set, defaults to process.cwd   [string] [default: "/crawls"]
 ```

behaviors/global/textextract.js (new file, 59 lines)
@@ -0,0 +1,59 @@
+class TextExtract {
+
+  constructor(dom){
+    this.dom = dom;
+  }
+
+  async parseText(node, metadata, accum) {
+    const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
+    const EMPTY_LIST = [];
+    const TEXT = "#text";
+    const TITLE = "title";
+
+    const name = node.nodeName.toLowerCase();
+
+    if (SKIPPED_NODES.includes(name)) {
+      return;
+    }
+
+    const children = node.children || EMPTY_LIST;
+
+    if (name === TEXT) {
+      const value = node.nodeValue ? node.nodeValue.trim() : '';
+      if (value) {
+        accum.push(value);
+      }
+    } else if (name === TITLE) {
+      const title = [];
+
+      for (let child of children) {
+        this.parseText(child, null, title);
+      }
+
+      if (metadata) {
+        metadata.title = title.join(' ');
+      } else {
+        accum.push(title.join(' '));
+      }
+    } else {
+      for (let child of children) {
+        this.parseText(child, metadata, accum);
+      }
+
+      if (node.contentDocument) {
+        this.parseText(node.contentDocument, null, accum);
+      }
+    }
+  }
+
+  async parseTextFromDom() {
+    const accum = [];
+    const metadata = {};
+
+    this.parseText(this.dom.root, metadata, accum);
+
+    return accum.join('\n');
+  }
+}
+
+module.exports = TextExtract;
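
The class walks the node tree returned by the Chrome DevTools Protocol call DOM.getDocument (plain objects carrying nodeName, nodeValue, children, and contentDocument), so it can be exercised without a browser. A minimal sketch, not part of the commit, using a hand-built snapshot whose node names and text values are placeholders:

```js
// Sketch only: drive TextExtract with a hand-built snapshot shaped like the
// {root: {...}} result of CDP DOM.getDocument. All values are placeholders.
const TextExtract = require("./behaviors/global/textextract");

const fakeSnapshot = {
  root: {
    nodeName: "#document",
    children: [{
      nodeName: "HTML",
      children: [
        // "head" is in SKIPPED_NODES, so nothing under it is visited
        { nodeName: "HEAD", children: [] },
        {
          nodeName: "BODY",
          children: [
            { nodeName: "P", children: [{ nodeName: "#text", nodeValue: "  Hello, world.  " }] },
            { nodeName: "SCRIPT", children: [{ nodeName: "#text", nodeValue: "ignored();" }] },
          ],
        },
      ],
    }],
  },
};

new TextExtract(fakeSnapshot).parseTextFromDom().then((text) => {
  console.log(text); // "Hello, world." -- trimmed, with head/script content skipped
});
```

Note that when a metadata object is passed down (as parseTextFromDom does), any title text is routed into metadata.title rather than into the returned string; only the accumulated #text values are joined with newlines and returned.
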

crawler.js (43 changed lines)
@@ -7,7 +7,7 @@ const path = require("path");
 const fs = require("fs");
 const Sitemapper = require("sitemapper");
 const { v4: uuidv4 } = require("uuid");

+const TextExtract = require("./behaviors/global/textextract");
 const BackgroundBehaviors = require("./behaviors/bgbehaviors");
@@ -222,6 +222,12 @@ class Crawler {
         default: false,
       },

+      "text": {
+        describe: "If set, extract text to the pages.jsonl file",
+        type: "boolean",
+        default: false,
+      },
+
       "cwd": {
         describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
         type: "string",
@@ -396,7 +402,7 @@ class Crawler {
       process.exit(1);
     }
   }

   async crawlPage({page, data}) {
     try {
       if (this.emulateDevice) {
@@ -407,9 +413,17 @@
       // run custom driver here
       await this.driver({page, data, crawler: this});

       const title = await page.title();
-      this.writePage(data.url, title);
+      var text = ''
+      if (this.params.text){
+        const client = await page.target().createCDPSession();
+        const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
+        var text = await new TextExtract(result).parseTextFromDom()
+      }
+
+      this.writePage(data.url, title, this.params.text, text);

       if (bgbehavior) {
         await bgbehavior();
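
The hunk above is the heart of the feature: when --text is set, crawlPage() opens a CDP session, requests a full, iframe-piercing DOM snapshot, and runs it through TextExtract before recording the page. For context, a standalone sketch of the same sequence outside the crawler, assuming the puppeteer package is installed (crawler.js already drives pages through puppeteer's API) and using a placeholder URL:

```js
// Standalone sketch (not part of the commit) of the --text extraction path.
const puppeteer = require("puppeteer");
const TextExtract = require("./behaviors/global/textextract");

async function extractPageText(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(url, { waitUntil: "load" });

  // Same CDP call as crawlPage(): depth -1 walks the whole tree,
  // pierce: true descends into iframes (their nodes appear under contentDocument).
  const client = await page.target().createCDPSession();
  const result = await client.send("DOM.getDocument", { depth: -1, pierce: true });

  const text = await new TextExtract(result).parseTextFromDom();

  await browser.close();
  return text;
}

extractPageText("https://example.com/").then((text) => console.log(text));
```
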
@@ -486,7 +500,6 @@
     }
   }

   writeStats() {
     if (this.params.statsFilename) {
       const total = this.cluster.allTargetCount;
@@ -549,17 +562,31 @@
       // create pages dir if doesn't exist and write pages.jsonl header
       if (!fs.existsSync(this.pagesDir)) {
         fs.mkdirSync(this.pagesDir);
-        const header = JSON.stringify({"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": false}).concat("\n");
-        fs.writeFileSync(this.pagesFile, header);
+        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
+        if (this.params.text) {
+          console.log("creating pages with full text");
+          header["hasText"] = true
+        }
+        else{
+          console.log("creating pages without full text");
+          header["hasText"] = false
+        }
+        const header_formatted = JSON.stringify(header).concat("\n")
+        fs.writeFileSync(this.pagesFile, header_formatted);
       }
     } catch(err) {
       console.log("pages/pages.jsonl creation failed", err);
     }
   }

-  writePage(url, title){
+  writePage(url, title, text, text_content){
     const id = uuidv4();
     const row = {"id": id, "url": url, "title": title};

+    if (text == true){
+      row['text'] = text_content
+    }
+
     const processedRow = JSON.stringify(row).concat("\n");
     try {
       fs.appendFileSync(this.pagesFile, processedRow);
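
Taken together, initPages() and writePage() now produce a pages.jsonl whose header advertises hasText and whose rows carry the extracted text when --text is set. An illustrative file, with placeholder id, url, title, and text values:

```
{"format":"json-pages-1.0","id":"pages","title":"All Pages","hasText":true}
{"id":"<uuid>","url":"https://example.com/","title":"Example Domain","text":"Example Domain\nThis domain is for use in illustrative examples..."}
```
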