mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00

* Create an argument parser class * move constants, arg parser to separate files in utils/* * ensure yaml config overridden by command-line args * yaml loading work: - simplify yaml config by using yargs.config option - move all option parsing to argParser, simply expose parseArgs - export constants directly - add lint to util/* files * support inline 'seeds' in cmdline and yaml config tests: - add test for crawl config, ensuring seeds crawled + wacz created - add test to ensure cmdline overrides yaml config * scope fix: empty scope implies only fixed list, use '.*' for any scope * lint fix * update readme with yaml config info * allow 'url' and 'seeds' if both provided Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> Co-authored-by: emmadickson <emma.dickson@artsymail.com>
59 lines
1.3 KiB
JavaScript
59 lines
1.3 KiB
JavaScript
// Lower-cased node names whose whole subtree is left out of the extracted text.
const SKIPPED_NODES = new Set(["head", "script", "style", "header", "footer", "banner-div", "noscript"]);

// Shared stand-in for nodes that carry no `children` property.
const EMPTY_LIST = Object.freeze([]);

const TEXT_NODE = "#text";
const TITLE_NODE = "title";

/**
 * Depth-first walk that gathers trimmed, non-empty text node values.
 *
 * Kept synchronous on purpose: the traversal performs no I/O, and running it
 * synchronously means any exception surfaces through the single caller instead
 * of being lost in a chain of un-awaited promises.
 *
 * @param {object} node - Tree node; `nodeName`, `nodeValue`, `children` and
 *   `contentDocument` are the only properties read.
 * @param {?object} metadata - When truthy, receives the text of a `title`
 *   element as `metadata.title` instead of that text going into `accum`.
 * @param {string[]} accum - Output list, appended to in document order.
 */
function collectText(node, metadata, accum) {
  const name = node.nodeName.toLowerCase();

  if (SKIPPED_NODES.has(name)) {
    return;
  }

  const children = node.children || EMPTY_LIST;

  if (name === TEXT_NODE) {
    const value = node.nodeValue ? node.nodeValue.trim() : "";
    if (value) {
      accum.push(value);
    }
    return;
  }

  if (name === TITLE_NODE) {
    // Title text is gathered separately so it can be routed to metadata.
    const titleParts = [];
    for (const child of children) {
      collectText(child, null, titleParts);
    }

    const title = titleParts.join(" ");
    if (metadata) {
      metadata.title = title;
    } else {
      accum.push(title);
    }
    return;
  }

  for (const child of children) {
    collectText(child, metadata, accum);
  }

  // Nested documents are walked after the node's own children and never
  // write to the outer metadata object.
  if (node.contentDocument) {
    collectText(node.contentDocument, null, accum);
  }
}

/**
 * Extracts the visible text from a DOM tree snapshot.
 *
 * NOTE(review): `dom` is presumably the result of the DevTools Protocol
 * `DOM.getDocument` call (an object with a `root` node) — confirm with caller.
 */
class TextExtract {
  /**
   * @param {object} dom - Snapshot object; only `dom.root` is read.
   */
  constructor(dom) {
    this.dom = dom;
  }

  /**
   * Collects the text beneath `node` into `accum`.
   *
   * The traversal itself runs synchronously, so `accum` is already complete
   * when this call returns its promise; awaiting the promise is what surfaces
   * a traversal error to the caller.
   *
   * @param {object} node - Node to start from.
   * @param {?object} metadata - Optional object that receives `title`.
   * @param {string[]} accum - Output list of text fragments.
   * @returns {Promise<void>} Rejects if the traversal throws (for example on
   *   a node without `nodeName`).
   */
  async parseText(node, metadata, accum) {
    collectText(node, metadata, accum);
  }

  /**
   * Extracts all text starting at `this.dom.root`.
   *
   * @returns {Promise<string>} Text fragments joined with newlines.
   */
  async parseTextFromDom() {
    const accum = [];
    const metadata = {};

    // Awaited so a malformed tree rejects this call instead of producing an
    // unhandled rejection alongside a partial result.
    await this.parseText(this.dom.root, metadata, accum);

    return accum.join("\n");
  }
}
|
|
|
|
// CommonJS export: the TextExtract class itself is this module's export.
module.exports = TextExtract;
|