browsertrix-crawler/textextract.js

60 lines
1.3 KiB
JavaScript
Raw Normal View History

class TextExtract {
constructor(dom){
this.dom = dom;
}
async parseText(node, metadata, accum) {
const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
const EMPTY_LIST = [];
const TEXT = "#text";
const TITLE = "title";
const name = node.nodeName.toLowerCase();
if (SKIPPED_NODES.includes(name)) {
return;
}
const children = node.children || EMPTY_LIST;
if (name === TEXT) {
const value = node.nodeValue ? node.nodeValue.trim() : "";
if (value) {
accum.push(value);
}
} else if (name === TITLE) {
const title = [];
for (let child of children) {
this.parseText(child, null, title);
}
if (metadata) {
metadata.title = title.join(" ");
} else {
accum.push(title.join(" "));
}
} else {
for (let child of children) {
this.parseText(child, metadata, accum);
}
if (node.contentDocument) {
this.parseText(node.contentDocument, null, accum);
}
}
}
async parseTextFromDom() {
const accum = [];
const metadata = {};
this.parseText(this.dom.root, metadata, accum);
return accum.join("\n");
}
}
module.exports = TextExtract;