browsertrix-crawler/util/textextract.js
Ilya Kreymer 277314f2de Convert to ESM (#179)
* switch base image to chrome/chromium 105 with node 18.x
* convert all source to esm for node 18.x, remove unneeded node-fetch dependency
* ci: use node 18.x, update to latest actions
* tests: convert to esm, run with --experimental-vm-modules
* tests: set higher default timeout (90s) for all tests
* tests: rename driver test fixture to .mjs for loading in jest
* bump to 0.8.0
2022-11-15 18:30:27 -08:00

58 lines
1.2 KiB
JavaScript

export class TextExtract {
  /**
   * @param {Object} dom - DOM tree snapshot whose nodes expose `nodeName`,
   *   `nodeValue`, `children`, and (for frames) `contentDocument`, rooted at
   *   `dom.root`. NOTE(review): presumably a CDP `DOM.getDocument` result —
   *   confirm against the caller.
   */
  constructor(dom) {
    this.dom = dom;
  }

  /**
   * Recursively collect visible text fragments under `node`.
   *
   * @param {Object} node - current DOM node.
   * @param {?Object} metadata - if non-null, the page <title> text is stored
   *   in `metadata.title` instead of being pushed into `accum`.
   * @param {string[]} accum - accumulator that receives trimmed, non-empty
   *   text fragments in document order.
   * @returns {Promise<void>}
   */
  async parseText(node, metadata, accum) {
    const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
    const EMPTY_LIST = [];
    const TEXT = "#text";
    const TITLE = "title";

    const name = node.nodeName.toLowerCase();

    // Skip non-content containers (and their entire subtrees).
    if (SKIPPED_NODES.includes(name)) {
      return;
    }

    const children = node.children || EMPTY_LIST;

    if (name === TEXT) {
      const value = node.nodeValue ? node.nodeValue.trim() : "";
      if (value) {
        accum.push(value);
      }
    } else if (name === TITLE) {
      const title = [];

      for (const child of children) {
        // Fix: recursive async calls were previously not awaited (floating
        // promises) — correct only by accident while the body stays synchronous.
        await this.parseText(child, null, title);
      }

      if (metadata) {
        metadata.title = title.join(" ");
      } else {
        accum.push(title.join(" "));
      }
    } else {
      for (const child of children) {
        await this.parseText(child, metadata, accum);
      }

      // Descend into nested documents (e.g. iframes).
      if (node.contentDocument) {
        await this.parseText(node.contentDocument, null, accum);
      }
    }
  }

  /**
   * Extract all visible text from the stored DOM.
   *
   * @returns {Promise<string>} newline-joined text fragments; the <title>
   *   is captured into a local metadata object and excluded from the result.
   */
  async parseTextFromDom() {
    const accum = [];
    const metadata = {};

    await this.parseText(this.dom.root, metadata, accum);

    return accum.join("\n");
  }
}