browsertrix-crawler/util/textextract.js
Ilya Kreymer 3ebe511b32 Arg Parsing Refactor + Support for YAML Config Support (take 2!) (#59)
* Create an argument parser class

* move constants, arg parser to separate files in utils/*

* ensure yaml config overridden by command-line args

* yaml loading work:
- simplify yaml config by using yargs.config option
- move all option parsing to argParser, simply expose parseArgs
- export constants directly
- add lint to util/* files

* support inline 'seeds' in cmdline and yaml config

tests:
- add test for crawl config, ensuring seeds crawled + wacz created
- add test to ensure cmdline overrides yaml config

* scope fix: empty scope implies only fixed list, use '.*' for any scope

* lint fix

* update readme with yaml config info

* allow 'url' and 'seeds' if both provided

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
Co-authored-by: emmadickson <emma.dickson@artsymail.com>
2021-06-23 19:45:40 -07:00

59 lines
1.3 KiB
JavaScript

// Lower-cased node names whose whole subtree is excluded from the extracted text.
const SKIPPED_NODES = new Set(["head", "script", "style", "header", "footer", "banner-div", "noscript"]);
// Shared stand-in for nodes that carry no `children` array.
const NO_CHILDREN = Object.freeze([]);
const TEXT_NODE = "#text";
const TITLE_NODE = "title";

/**
 * Extracts the visible text of a DOM snapshot.
 *
 * The snapshot is a tree of plain objects exposing `nodeName`, and optionally
 * `nodeValue`, `children` (array of nodes) and `contentDocument` (a nested
 * document node).
 * NOTE(review): this looks like the node shape returned by the Chrome DevTools
 * Protocol `DOM.getDocument` call - confirm against the caller.
 */
class TextExtract {
  /**
   * @param {{root: object}} dom - snapshot whose `root` property is the top node.
   */
  constructor(dom) {
    this.dom = dom;
  }

  /**
   * Synchronously walks `node` depth-first and appends every non-empty,
   * trimmed text value to `accum`.
   *
   * Keeping the traversal synchronous means an error raised anywhere in the
   * tree propagates straight to the caller instead of being lost in a
   * promise that nobody awaits.
   *
   * @param {object} node - node to visit; must have a string `nodeName`.
   * @param {?object} metadata - when truthy, the text of a `title` element is
   *   stored in `metadata.title` instead of being appended to `accum`.
   * @param {string[]} accum - output list, mutated in place.
   * @throws {TypeError} if a visited node has no `nodeName`.
   */
  collectText(node, metadata, accum) {
    const name = node.nodeName.toLowerCase();

    if (SKIPPED_NODES.has(name)) {
      return;
    }

    if (name === TEXT_NODE) {
      const value = node.nodeValue ? node.nodeValue.trim() : "";
      if (value) {
        accum.push(value);
      }
      return;
    }

    const children = node.children || NO_CHILDREN;

    if (name === TITLE_NODE) {
      // Title text is gathered separately so it can be routed to metadata.
      const titleParts = [];
      for (const child of children) {
        this.collectText(child, null, titleParts);
      }
      const title = titleParts.join(" ");
      if (metadata) {
        metadata.title = title;
      } else {
        accum.push(title);
      }
      return;
    }

    for (const child of children) {
      this.collectText(child, metadata, accum);
    }

    // Embedded documents are walked without metadata, so a title found inside
    // one is appended to the text rather than overwriting the outer title.
    if (node.contentDocument) {
      this.collectText(node.contentDocument, null, accum);
    }
  }

  /**
   * Promise-returning wrapper around {@link TextExtract#collectText}.
   *
   * The traversal finishes before the promise is returned, so `accum` and
   * `metadata` are already populated for callers that do not await; a
   * traversal error rejects the returned promise.
   *
   * @param {object} node - node to visit.
   * @param {?object} metadata - optional object receiving `title`.
   * @param {string[]} accum - output list, mutated in place.
   * @returns {Promise<void>}
   */
  async parseText(node, metadata, accum) {
    this.collectText(node, metadata, accum);
  }

  /**
   * Extracts the text of `this.dom.root`.
   *
   * The top-level document title is collected into a local metadata object
   * and is therefore not part of the returned string.
   *
   * @returns {Promise<string>} extracted text fragments joined with "\n".
   */
  async parseTextFromDom() {
    const accum = [];
    const metadata = {};
    // Awaited so that a traversal failure rejects this call instead of
    // surfacing as an unhandled rejection alongside partial output.
    await this.parseText(this.dom.root, metadata, accum);
    return accum.join("\n");
  }
}
// Make the TextExtract class the value of this CommonJS module's exports.
module.exports = TextExtract;