browsertrix-crawler/textextract.js

// Lower-cased node names whose whole subtree is excluded from the extracted text.
const SKIPPED_NODE_NAMES = new Set(["head", "script", "style", "header", "footer", "banner-div", "noscript"]);

// Lower-cased nodeName values that receive special handling in parseText().
const TEXT_NODE_NAME = "#text";
const TITLE_NODE_NAME = "title";

/**
 * Extracts text from a DOM-like tree of plain node objects.
 *
 * Each node is read through these properties only: `nodeName` (required),
 * `children` (optional iterable of child nodes), `nodeValue` (text nodes) and
 * `contentDocument` (optional nested document, traversed after the children).
 */
class TextExtract {

  /**
   * @param {object} dom - object whose `root` property is the top node to walk.
   */
  constructor(dom){
    this.dom = dom;
  }

  /**
   * Recursively walks `node` and appends every non-empty, trimmed text value
   * to `accum` in document order.
   *
   * A `title` element is gathered separately: its text is stored in
   * `metadata.title` when `metadata` is provided, otherwise it is pushed onto
   * `accum` like regular text.
   *
   * @param {object} node - node to process; must have a string `nodeName`.
   * @param {object|null} metadata - receives `title`; pass null to inline titles.
   * @param {string[]} accum - output array, mutated in place.
   * @returns {Promise<void>} settles once the whole subtree has been visited;
   *   rejects if any node in the subtree cannot be processed.
   */
  async parseText(node, metadata, accum) {
    const name = node.nodeName.toLowerCase();

    if (SKIPPED_NODE_NAMES.has(name)) {
      return;
    }

    const children = node.children || [];

    if (name === TEXT_NODE_NAME) {
      const value = node.nodeValue ? node.nodeValue.trim() : "";
      if (value) {
        accum.push(value);
      }
    } else if (name === TITLE_NODE_NAME) {
      const title = [];

      for (const child of children) {
        // Awaited so a failure in a child rejects this call instead of
        // surfacing as an unhandled rejection.
        await this.parseText(child, null, title);
      }

      if (metadata) {
        metadata.title = title.join(" ");
      } else {
        accum.push(title.join(" "));
      }
    } else {
      for (const child of children) {
        await this.parseText(child, metadata, accum);
      }

      if (node.contentDocument) {
        // Nested documents are walked with metadata = null, so their titles
        // are emitted inline rather than overwriting the outer title.
        await this.parseText(node.contentDocument, null, accum);
      }
    }
  }

  /**
   * Extracts the text of `this.dom.root`.
   *
   * @returns {Promise<string>} the collected text fragments joined by "\n".
   */
  async parseTextFromDom() {
    const accum = [];
    // Passing a metadata object keeps the top-level title out of the text;
    // the metadata itself is not returned.
    const metadata = {};

    await this.parseText(this.dom.root, metadata, accum);

    return accum.join("\n");
  }
}

// CommonJS export: the module's sole export is the TextExtract class itself.
module.exports = TextExtract;
add text extraction (#28) * add text extraction via --text flag * update readme with --text and --generateWACZ flags Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-02-23 16:52:54 -05:00			`class TextExtract {`

			`constructor(dom){`
			`this.dom = dom;`
			`}`

			`async parseText(node, metadata, accum) {`
			`const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];`
			`const EMPTY_LIST = [];`
			`const TEXT = "#text";`
			`const TITLE = "title";`

			`const name = node.nodeName.toLowerCase();`

			`if (SKIPPED_NODES.includes(name)) {`
			`return;`
			`}`

			`const children = node.children \|\| EMPTY_LIST;`

			`if (name === TEXT) {`
Create --combineWARC flag that combines generated warcs into a single warc upto rollover size (#33) * generates combined WARCs in collection root directory with suffix `_0.warc`, `_1.warc`, etc.. * each combined WARC limited by the size in `--rolloverSize`, if exceeds a new WARC is created, otherwise appended to previous WARC. * add test for --combineWARC flag * add improved lint rules Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-03-31 13:41:27 -04:00			`const value = node.nodeValue ? node.nodeValue.trim() : "";`
add text extraction (#28) * add text extraction via --text flag * update readme with --text and --generateWACZ flags Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-02-23 16:52:54 -05:00			`if (value) {`
			`accum.push(value);`
			`}`
			`} else if (name === TITLE) {`
			`const title = [];`

			`for (let child of children) {`
			`this.parseText(child, null, title);`
			`}`

			`if (metadata) {`
Create --combineWARC flag that combines generated warcs into a single warc upto rollover size (#33) * generates combined WARCs in collection root directory with suffix `_0.warc`, `_1.warc`, etc.. * each combined WARC limited by the size in `--rolloverSize`, if exceeds a new WARC is created, otherwise appended to previous WARC. * add test for --combineWARC flag * add improved lint rules Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-03-31 13:41:27 -04:00			`metadata.title = title.join(" ");`
add text extraction (#28) * add text extraction via --text flag * update readme with --text and --generateWACZ flags Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-02-23 16:52:54 -05:00			`} else {`
Create --combineWARC flag that combines generated warcs into a single warc upto rollover size (#33) * generates combined WARCs in collection root directory with suffix `_0.warc`, `_1.warc`, etc.. * each combined WARC limited by the size in `--rolloverSize`, if exceeds a new WARC is created, otherwise appended to previous WARC. * add test for --combineWARC flag * add improved lint rules Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-03-31 13:41:27 -04:00			`accum.push(title.join(" "));`
add text extraction (#28) * add text extraction via --text flag * update readme with --text and --generateWACZ flags Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-02-23 16:52:54 -05:00			`}`
			`} else {`
			`for (let child of children) {`
			`this.parseText(child, metadata, accum);`
			`}`

			`if (node.contentDocument) {`
Create --combineWARC flag that combines generated warcs into a single warc upto rollover size (#33) * generates combined WARCs in collection root directory with suffix `_0.warc`, `_1.warc`, etc.. * each combined WARC limited by the size in `--rolloverSize`, if exceeds a new WARC is created, otherwise appended to previous WARC. * add test for --combineWARC flag * add improved lint rules Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-03-31 13:41:27 -04:00			`this.parseText(node.contentDocument, null, accum);`
add text extraction (#28) * add text extraction via --text flag * update readme with --text and --generateWACZ flags Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-02-23 16:52:54 -05:00			`}`
			`}`
			`}`

			`async parseTextFromDom() {`
			`const accum = [];`
			`const metadata = {};`

			`this.parseText(this.dom.root, metadata, accum);`

Create --combineWARC flag that combines generated warcs into a single warc upto rollover size (#33) * generates combined WARCs in collection root directory with suffix `_0.warc`, `_1.warc`, etc.. * each combined WARC limited by the size in `--rolloverSize`, if exceeds a new WARC is created, otherwise appended to previous WARC. * add test for --combineWARC flag * add improved lint rules Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-03-31 13:41:27 -04:00			`return accum.join("\n");`
add text extraction (#28) * add text extraction via --text flag * update readme with --text and --generateWACZ flags Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-02-23 16:52:54 -05:00			`}`
			`}`

			`module.exports = TextExtract;`