mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00

* generates combined WARCs in the collection root directory with suffix `_0.warc`, `_1.warc`, etc. * each combined WARC is limited by the size given in `--rolloverSize`; if that size is exceeded a new WARC is created, otherwise data is appended to the previous WARC. * add test for --combineWARC flag * add improved lint rules Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
59 lines
1.3 KiB
JavaScript
59 lines
1.3 KiB
JavaScript
// Lower-cased node names whose whole subtree is excluded from the extracted text.
const SKIPPED_NODES = new Set(["head", "script", "style", "header", "footer", "banner-div", "noscript"]);

// Lower-cased nodeName values that get special handling during traversal.
const TEXT_NODE = "#text";
const TITLE_NODE = "title";

/**
 * Synchronously walks `node` and its descendants, collecting trimmed text.
 *
 * Kept synchronous on purpose: the traversal does no I/O, and a plain
 * recursive function lets any error propagate directly to the caller instead
 * of being lost in un-awaited promises.
 *
 * @param {Object} node - Node with `nodeName` and optional `nodeValue`,
 *   `children` and `contentDocument` properties.
 * @param {?Object} metadata - When truthy, the text of a `title` element is
 *   stored in `metadata.title`; otherwise it is pushed onto `accum`.
 * @param {string[]} accum - Array receiving the non-empty text fragments.
 */
function collectText(node, metadata, accum) {
  const name = node.nodeName.toLowerCase();

  if (SKIPPED_NODES.has(name)) {
    return;
  }

  if (name === TEXT_NODE) {
    const value = node.nodeValue ? node.nodeValue.trim() : "";
    if (value) {
      accum.push(value);
    }
    return;
  }

  const children = node.children || [];

  if (name === TITLE_NODE) {
    // Gather the title's own text separately so it can be routed to metadata.
    const title = [];
    for (const child of children) {
      collectText(child, null, title);
    }

    if (metadata) {
      metadata.title = title.join(" ");
    } else {
      accum.push(title.join(" "));
    }
    return;
  }

  for (const child of children) {
    collectText(child, metadata, accum);
  }

  // Nested documents are walked with no metadata object, so a title found
  // inside them is appended to the text rather than overwriting
  // metadata.title of the outer document.
  if (node.contentDocument) {
    collectText(node.contentDocument, null, accum);
  }
}

/**
 * Extracts the visible text from a DOM tree object.
 *
 * NOTE(review): the node shape used here (`root`, `nodeName`, `nodeValue`,
 * `children`, `contentDocument`) looks like a Chrome DevTools Protocol DOM
 * snapshot - confirm against the caller.
 */
class TextExtract {
  /**
   * @param {Object} dom - Object whose `root` property is the top node to walk.
   */
  constructor(dom) {
    this.dom = dom;
  }

  /**
   * Collects the text of `node` and its descendants into `accum`.
   *
   * The traversal itself runs synchronously, so `accum` (and
   * `metadata.title`) are fully populated by the time this call returns its
   * promise. If the traversal throws, the returned promise rejects.
   *
   * @param {Object} node - Node to start from.
   * @param {?Object} metadata - Optional object receiving `title`.
   * @param {string[]} accum - Array receiving the text fragments.
   * @returns {Promise<void>}
   */
  async parseText(node, metadata, accum) {
    collectText(node, metadata, accum);
  }

  /**
   * Extracts all text below `this.dom.root`.
   *
   * Awaits `parseText` so that a malformed tree rejects this promise instead
   * of producing an unhandled rejection alongside a partial result.
   *
   * @returns {Promise<string>} The text fragments joined with newlines.
   */
  async parseTextFromDom() {
    const accum = [];
    // The title collected into this object is not part of the returned text.
    const metadata = {};

    await this.parseText(this.dom.root, metadata, accum);

    return accum.join("\n");
  }
}
|
|
|
|
// Expose the TextExtract class as this module's CommonJS export.
module.exports = TextExtract;
|