mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00

* switch base image to chrome/chromium 105 with node 18.x
* convert all source to esm for node 18.x, remove unneeded node-fetch dependency
* ci: use node 18.x, update to latest actions
* tests: convert to esm, run with --experimental-vm-modules
* tests: set higher default timeout (90s) for all tests
* tests: rename driver test fixture to .mjs for loading in jest
* bump to 0.8.0
58 lines
1.2 KiB
JavaScript
58 lines
1.2 KiB
JavaScript
// Lower-cased node names whose whole subtree is excluded from the extracted text.
const SKIPPED_NODES = new Set(["head", "script", "style", "header", "footer", "banner-div", "noscript"]);
const TEXT = "#text";
const TITLE = "title";

/**
 * Synchronously walk `node` and its descendants, collecting trimmed text.
 *
 * Kept synchronous on purpose: the traversal performs no asynchronous work,
 * and recursing through an un-awaited async method would turn any exception
 * thrown for a nested node into a floating rejected promise.
 *
 * @param {object} node - Node-like object exposing `nodeName` and, optionally,
 *   `children`, `nodeValue` and `contentDocument`.
 * @param {?object} metadata - When truthy, receives a `title` property for a
 *   `title` element instead of the title text being appended to `accum`.
 * @param {string[]} accum - Array that collected text fragments are pushed onto.
 */
function collectText(node, metadata, accum) {
  const name = node.nodeName.toLowerCase();

  if (SKIPPED_NODES.has(name)) {
    return;
  }

  const children = node.children || [];

  if (name === TEXT) {
    const value = node.nodeValue ? node.nodeValue.trim() : "";
    if (value) {
      accum.push(value);
    }
    return;
  }

  if (name === TITLE) {
    // Title text is gathered separately so it can be routed to metadata.
    const title = [];
    for (const child of children) {
      collectText(child, null, title);
    }

    const text = title.join(" ");
    if (metadata) {
      metadata.title = text;
    } else {
      accum.push(text);
    }
    return;
  }

  for (const child of children) {
    collectText(child, metadata, accum);
  }

  // Nodes carrying a nested document (presumably frames — confirm against the
  // DOM source) contribute their text too; metadata is not passed down, so a
  // nested title lands in `accum`.
  if (node.contentDocument) {
    collectText(node.contentDocument, null, accum);
  }
}

/**
 * Extracts plain text from a DOM tree given as nested node-like objects.
 */
export class TextExtract {
  /**
   * @param {object} dom - Object whose `root` property is the top node to
   *   extract text from.
   */
  constructor(dom) {
    this.dom = dom;
  }

  /**
   * Collect the text of `node` and its descendants into `accum`.
   *
   * The traversal itself runs synchronously when this method is called, so
   * `accum` is fully populated before the returned promise is awaited. Any
   * error raised during traversal rejects the returned promise.
   *
   * @param {object} node - Node to start from.
   * @param {?object} metadata - Optional object that receives `title`.
   * @param {string[]} accum - Array that text fragments are pushed onto.
   * @returns {Promise<void>}
   */
  async parseText(node, metadata, accum) {
    collectText(node, metadata, accum);
  }

  /**
   * Extract the text of `this.dom.root`.
   *
   * @returns {Promise<string>} Collected text fragments joined with newlines.
   */
  async parseTextFromDom() {
    const accum = [];
    // A title may be recorded here during traversal, but only the text is returned.
    const metadata = {};

    // Awaited so traversal errors reject this call instead of going unhandled.
    await this.parseText(this.dom.root, metadata, accum);

    return accum.join("\n");
  }
}
|
|
|