browsertrix-crawler/util/textextract.js
Ilya Kreymer 2aeda56d40
improved text extraction: (addresses #403) (#404)
- use DOMSnapshot.captureSnapshot instead of older DOM.getDocument to
get the snapshot (consistent with ArchiveWeb.page) - should be slightly
more performant
- keep option to use DOM.getDocument
- refactor warc resource writing to separate class, used by text
extraction and screenshots
- write extracted text to WARC files as 'urn:text:<url>' after page
loads, similar to screenshots
- also store final text to WARC as 'urn:textFinal:<url>' if it is
different
- cli options: update `--text` to take one more more comma-separated
string options `--text to-warc,to-pages,final-to-warc`. For backwards
compatibility, support `--text` and `--text true` to be equivalent to
`--text to-pages`.

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-10-31 23:05:30 -07:00

148 lines
4 KiB
JavaScript

import { WARCResourceWriter } from "./warcresourcewriter.js";
import { logger } from "./logger.js";
// ============================================================================
export class BaseTextExtract extends WARCResourceWriter {
constructor(cdp, opts) {
super({...opts, warcName: "text.warc.gz"});
this.cdp = cdp;
this.lastText = null;
}
async extractAndStoreText(resourceType, ignoreIfMatchesLast = false, saveToWarc = false) {
try {
const text = await this.doGetText();
if (ignoreIfMatchesLast && text === this.lastText) {
this.lastText = this.text;
logger.debug("Skipping, extracted text unchanged from last extraction", {url: this.url}, "text");
return {changed: false, text};
}
if (saveToWarc) {
await this.writeBufferToWARC(new TextEncoder().encode(text), resourceType, "text/plain");
logger.debug(`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`);
}
this.lastText = text;
return {changed: true, text};
} catch (e) {
logger.debug("Error extracting text", e, "text");
return {changed: false, text: null};
}
}
async doGetText() {
throw new Error("unimplemented");
}
}
// ============================================================================
export class TextExtractViaSnapshot extends BaseTextExtract {
async doGetText() {
const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []});
return this.parseTextFromDOMSnapshot(result);
}
parseTextFromDOMSnapshot(result) {
const TEXT_NODE = 3;
const ELEMENT_NODE = 1;
const SKIPPED_NODES = ["SCRIPT", "STYLE", "HEADER", "FOOTER", "BANNER-DIV", "NOSCRIPT", "TITLE"];
const {strings, documents} = result;
const accum = [];
for (const doc of documents) {
const nodeValues = doc.nodes.nodeValue;
const nodeNames = doc.nodes.nodeName;
const nodeTypes = doc.nodes.nodeType;
const parentIndex = doc.nodes.parentIndex;
for (let i = 0; i < nodeValues.length; i++) {
if (nodeValues[i] === -1) {
continue;
}
if (nodeTypes[i] === TEXT_NODE) {
const pi = parentIndex[i];
if (pi >= 0 && nodeTypes[pi] === ELEMENT_NODE) {
const name = strings[nodeNames[pi]];
if (!SKIPPED_NODES.includes(name)) {
const value = strings[nodeValues[i]].trim();
if (value) {
accum.push(value);
}
}
}
}
}
return accum.join("\n");
}
}
}
// ============================================================================
export class TextExtractViaDocument extends BaseTextExtract {
async doGetText() {
const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
return this.parseTextFromDOM(result);
}
async parseTextFromDom(dom) {
const accum = [];
const metadata = {};
this.parseText(dom.root, metadata, accum);
return accum.join("\n");
}
async parseText(node, metadata, accum) {
const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
const EMPTY_LIST = [];
const TEXT = "#text";
const TITLE = "title";
const name = node.nodeName.toLowerCase();
if (SKIPPED_NODES.includes(name)) {
return;
}
const children = node.children || EMPTY_LIST;
if (name === TEXT) {
const value = node.nodeValue ? node.nodeValue.trim() : "";
if (value) {
accum.push(value);
}
} else if (name === TITLE) {
const title = [];
for (let child of children) {
this.parseText(child, null, title);
}
if (metadata) {
metadata.title = title.join(" ");
} else {
accum.push(title.join(" "));
}
} else {
for (let child of children) {
this.parseText(child, metadata, accum);
}
if (node.contentDocument) {
this.parseText(node.contentDocument, null, accum);
}
}
}
}