Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
add text extraction (#28)

* add text extraction via --text flag
* update readme with --text and --generateWACZ flags

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>

parent 0688674f6f
commit 748b0399e9

3 changed files with 98 additions and 8 deletions
4  README.md

@@ -65,6 +65,10 @@ Options:
                            [string] [default: "/app/defaultDriver.js"]
   --generateCDX    If set, generate index (CDXJ) for use with pywb after crawl
                    is done                          [boolean] [default: false]
+  --generateWACZ   If set, generate wacz for use with pywb after crawl
+                   is done                          [boolean] [default: false]
+  --text           If set, extract the pages full text to be added to the pages.jsonl
+                   file                             [boolean] [default: false]
   --cwd            Crawl working directory for captures (pywb root). If not
                    set, defaults to process.cwd     [string] [default: "/crawls"]
 ```
59  behaviors/global/textextract.js  (new file)

@@ -0,0 +1,59 @@
class TextExtract {

  constructor(dom){
    this.dom = dom;
  }

  async parseText(node, metadata, accum) {
    const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
    const EMPTY_LIST = [];
    const TEXT = "#text";
    const TITLE = "title";

    const name = node.nodeName.toLowerCase();

    if (SKIPPED_NODES.includes(name)) {
      return;
    }

    const children = node.children || EMPTY_LIST;

    if (name === TEXT) {
      const value = node.nodeValue ? node.nodeValue.trim() : '';
      if (value) {
        accum.push(value);
      }
    } else if (name === TITLE) {
      const title = [];

      for (let child of children) {
        this.parseText(child, null, title);
      }

      if (metadata) {
        metadata.title = title.join(' ');
      } else {
        accum.push(title.join(' '));
      }
    } else {
      for (let child of children) {
        this.parseText(child, metadata, accum);
      }

      if (node.contentDocument) {
        this.parseText(node.contentDocument, null, accum);
      }
    }
  }

  async parseTextFromDom() {
    const accum = [];
    const metadata = {};

    this.parseText(this.dom.root, metadata, accum);

    return accum.join('\n');
  }
}

module.exports = TextExtract;
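TextExtract walks a CDP-style DOM tree rather than live DOM nodes: each node is expected to expose nodeName, nodeValue, children, and (for frames) contentDocument, matching the shape returned by the DevTools DOM.getDocument command used later in this commit. A minimal sketch of exercising the class against a hand-built tree; the tree, the assertion, and the script location (repository root) are illustrative assumptions, not part of the commit:

```js
// Illustrative sketch only: feeds TextExtract a hand-built, CDP-shaped
// node tree to show the fields parseText() relies on.
const assert = require("assert");
const TextExtract = require("./behaviors/global/textextract");

const fakeDom = {
  root: {
    nodeName: "#document",
    children: [
      { nodeName: "HEAD", children: [
        { nodeName: "TITLE", children: [ { nodeName: "#text", nodeValue: "Example Page" } ] },
        { nodeName: "SCRIPT", children: [ { nodeName: "#text", nodeValue: "ignored()" } ] },
      ]},
      { nodeName: "BODY", children: [
        { nodeName: "P", children: [ { nodeName: "#text", nodeValue: "  Hello world  " } ] },
      ]},
    ],
  },
};

new TextExtract(fakeDom).parseTextFromDom().then((text) => {
  // <head> is in SKIPPED_NODES, so only the trimmed body text survives.
  assert.strictEqual(text, "Hello world");
  console.log(JSON.stringify(text));
});
```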
43  crawler.js
@@ -7,7 +7,7 @@ const path = require("path");
 const fs = require("fs");
 const Sitemapper = require("sitemapper");
 const { v4: uuidv4 } = require("uuid");
+const TextExtract = require("./behaviors/global/textextract");
 const BackgroundBehaviors = require("./behaviors/bgbehaviors");
 
 
@@ -222,6 +222,12 @@ class Crawler {
         default: false,
       },
 
+      "text": {
+        describe: "If set, extract text to the pages.jsonl file",
+        type: "boolean",
+        default: false,
+      },
+
       "cwd": {
         describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
         type: "string",
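The new option block follows the same yargs shape as the existing flags (describe/type/default), so the value surfaces on the parsed params object as a plain boolean. A minimal standalone sketch of that pattern, assuming the classic yargs CommonJS API; this is not the crawler's actual argument wiring:

```js
// Minimal sketch of the yargs option style used above; standalone, not crawler code.
const yargs = require("yargs");

const params = yargs
  .options({
    "text": {
      describe: "If set, extract text to the pages.jsonl file",
      type: "boolean",
      default: false,
    },
  })
  .argv;

// With no --text flag this prints false; `node sketch.js --text` prints true.
console.log("text extraction enabled:", params.text);
```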
@@ -396,7 +402,7 @@ class Crawler {
       process.exit(1);
     }
   }
 
   async crawlPage({page, data}) {
     try {
       if (this.emulateDevice) {
@@ -407,9 +413,17 @@
 
       // run custom driver here
       await this.driver({page, data, crawler: this});
 
 
       const title = await page.title();
-      this.writePage(data.url, title);
+      var text = ''
+      if (this.params.text){
+        const client = await page.target().createCDPSession();
+        const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
+        var text = await new TextExtract(result).parseTextFromDom()
+      }
+
+      this.writePage(data.url, title, this.params.text, text);
+
       if (bgbehavior) {
         await bgbehavior();
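The crawlPage() change above is the whole extraction path: when --text is set, a CDP session is opened on the page, DOM.getDocument is asked for the full pierced tree, and the response is handed to TextExtract. The same sequence can be reproduced outside the crawler with a few lines of Puppeteer; a rough standalone sketch in which the URL and require path are placeholders and error handling is omitted:

```js
// Standalone sketch (not crawler code): mirrors the --text path in crawlPage().
const puppeteer = require("puppeteer");
const TextExtract = require("./behaviors/global/textextract");

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto("https://example.com/", { waitUntil: "networkidle2" });

  // depth: -1 returns the whole DOM tree; pierce: true descends into iframes
  // and shadow roots, which is what makes node.contentDocument visible to TextExtract.
  const client = await page.target().createCDPSession();
  const result = await client.send("DOM.getDocument", { depth: -1, pierce: true });

  const text = await new TextExtract(result).parseTextFromDom();
  console.log(text);

  await browser.close();
})();
```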
@@ -486,7 +500,6 @@ class Crawler {
     }
   }
 
-
   writeStats() {
     if (this.params.statsFilename) {
       const total = this.cluster.allTargetCount;
@@ -549,17 +562,31 @@
       // create pages dir if doesn't exist and write pages.jsonl header
       if (!fs.existsSync(this.pagesDir)) {
         fs.mkdirSync(this.pagesDir);
-        const header = JSON.stringify({"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": false}).concat("\n");
-        fs.writeFileSync(this.pagesFile, header);
+        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
+        if (this.params.text) {
+          console.log("creating pages with full text");
+          header["hasText"] = true
+        }
+        else{
+          console.log("creating pages without full text");
+          header["hasText"] = false
+        }
+        const header_formatted = JSON.stringify(header).concat("\n")
+        fs.writeFileSync(this.pagesFile, header_formatted);
       }
     } catch(err) {
       console.log("pages/pages.jsonl creation failed", err);
     }
   }
 
-  writePage(url, title){
+  writePage(url, title, text, text_content){
     const id = uuidv4();
     const row = {"id": id, "url": url, "title": title};
+
+    if (text == true){
+      row['text'] = text_content
+    }
+
     const processedRow = JSON.stringify(row).concat("\n");
     try {
       fs.appendFileSync(this.pagesFile, processedRow);
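Taken together, pages.jsonl now opens with a header whose hasText flag mirrors --text, followed by one JSON line per page that gains a text field when extraction is enabled. An illustrative sketch of the shapes written by the header setup and writePage() above; the URL, title, and text values are placeholders:

```js
// Illustrative only: mirrors the pages.jsonl header and row shapes after this change.
const { v4: uuidv4 } = require("uuid");

const textEnabled = true;  // stands in for this.params.text

const header = { "format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": textEnabled };

const row = { "id": uuidv4(), "url": "https://example.com/", "title": "Example Domain" };
if (textEnabled) {
  row["text"] = "Example Domain\nThis domain is for use in illustrative examples...";
}

process.stdout.write(JSON.stringify(header).concat("\n"));
process.stdout.write(JSON.stringify(row).concat("\n"));
```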