mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00

- use DOMSnapshot.captureSnapshot instead of older DOM.getDocument to get the snapshot (consistent with ArchiveWeb.page) - should be slightly more performant - keep option to use DOM.getDocument - refactor warc resource writing to separate class, used by text extraction and screenshots - write extracted text to WARC files as 'urn:text:<url>' after page loads, similar to screenshots - also store final text to WARC as 'urn:textFinal:<url>' if it is different - cli options: update `--text` to take one or more comma-separated string options `--text to-warc,to-pages,final-to-warc`. For backwards compatibility, support `--text` and `--text true` to be equivalent to `--text to-pages`. --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
148 lines
4 KiB
JavaScript
148 lines
4 KiB
JavaScript
import { WARCResourceWriter } from "./warcresourcewriter.js";
|
|
import { logger } from "./logger.js";
|
|
|
|
|
|
// ============================================================================
|
|
/**
 * Base class for page text extractors.
 *
 * Subclasses implement `doGetText()`; this class handles change detection
 * against the previously extracted text and optional writing of the text to
 * a WARC via `writeBufferToWARC()` (expected to be provided by
 * WARCResourceWriter — defined outside this file).
 */
export class BaseTextExtract extends WARCResourceWriter {
  /**
   * @param {object} cdp - Object exposing `send(method, params)`; used by
   *   subclasses to issue protocol commands.
   * @param {object} opts - Options forwarded to WARCResourceWriter. The
   *   `warcName` option is always overridden with "text.warc.gz".
   */
  constructor(cdp, opts) {
    super({...opts, warcName: "text.warc.gz"});
    this.cdp = cdp;
    // Text returned by the most recent successful, non-skipped extraction.
    this.lastText = null;
  }

  /**
   * Extract the page text and optionally store it in the WARC.
   *
   * @param {string} resourceType - Resource type passed through to
   *   `writeBufferToWARC()` and included in the debug log.
   * @param {boolean} [ignoreIfMatchesLast=false] - When true and the text is
   *   identical to the last extracted text, skip writing and report
   *   `changed: false`.
   * @param {boolean} [saveToWarc=false] - When true, write the UTF-8 encoded
   *   text to the WARC as "text/plain".
   * @returns {Promise<{changed: boolean, text: (string|null)}>} The extracted
   *   text and whether it was treated as new. On any error the failure is
   *   logged and `{changed: false, text: null}` is returned (never rejects).
   */
  async extractAndStoreText(resourceType, ignoreIfMatchesLast = false, saveToWarc = false) {
    try {
      const text = await this.doGetText();

      if (ignoreIfMatchesLast && text === this.lastText) {
        // lastText already equals text here, so it is deliberately left
        // untouched: overwriting it would make the next identical
        // extraction look like a change.
        logger.debug("Skipping, extracted text unchanged from last extraction", {url: this.url}, "text");
        return {changed: false, text};
      }

      if (saveToWarc) {
        await this.writeBufferToWARC(new TextEncoder().encode(text), resourceType, "text/plain");
        logger.debug(`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`);
      }

      this.lastText = text;
      return {changed: true, text};
    } catch (e) {
      // Best-effort: text extraction failures must not abort the caller.
      logger.debug("Error extracting text", e, "text");
      return {changed: false, text: null};
    }
  }

  /**
   * Produce the page text. Must be overridden by subclasses.
   *
   * @returns {Promise<string>}
   * @throws {Error} Always, in the base class.
   */
  async doGetText() {
    throw new Error("unimplemented");
  }
}
|
|
|
|
|
|
// ============================================================================
|
|
/**
 * Text extractor that reads text nodes out of a
 * `DOMSnapshot.captureSnapshot` result.
 */
export class TextExtractViaSnapshot extends BaseTextExtract {
  /**
   * Capture a DOM snapshot through `this.cdp` and return its text.
   *
   * @returns {Promise<string>} Newline-joined text of the snapshot.
   */
  async doGetText() {
    const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []});
    return this.parseTextFromDOMSnapshot(result);
  }

  /**
   * Collect the trimmed, non-empty values of all text nodes whose parent is
   * an element not listed in SKIPPED_NODES, across every document in the
   * snapshot.
   *
   * @param {{strings: string[], documents: object[]}} result - Snapshot in
   *   which each document's `nodes` holds parallel arrays (`nodeName`,
   *   `nodeType`, `nodeValue`, `parentIndex`); name/value entries are indexes
   *   into `strings`, with -1 meaning "no value".
   * @returns {string} Extracted text fragments joined with "\n" ("" when
   *   nothing was found).
   */
  parseTextFromDOMSnapshot(result) {
    const TEXT_NODE = 3;
    const ELEMENT_NODE = 1;

    const SKIPPED_NODES = ["SCRIPT", "STYLE", "HEADER", "FOOTER", "BANNER-DIV", "NOSCRIPT", "TITLE"];

    const {strings, documents} = result;

    const accum = [];

    for (const doc of documents) {
      const nodeValues = doc.nodes.nodeValue;
      const nodeNames = doc.nodes.nodeName;
      const nodeTypes = doc.nodes.nodeType;
      const parentIndex = doc.nodes.parentIndex;

      for (let i = 0; i < nodeValues.length; i++) {
        // -1 marks a node without a value; only text nodes are of interest.
        if (nodeValues[i] === -1 || nodeTypes[i] !== TEXT_NODE) {
          continue;
        }

        const pi = parentIndex[i];
        if (pi < 0 || nodeTypes[pi] !== ELEMENT_NODE) {
          continue;
        }

        const name = strings[nodeNames[pi]];
        if (SKIPPED_NODES.includes(name)) {
          continue;
        }

        const value = strings[nodeValues[i]].trim();
        if (value) {
          accum.push(value);
        }
      }
    }

    // Join only after every document has been visited, so text from all
    // documents in the snapshot is included and an empty document list
    // yields "" rather than undefined.
    return accum.join("\n");
  }
}
|
|
|
|
|
|
// ============================================================================
|
|
/**
 * Text extractor that walks the node tree returned by `DOM.getDocument`.
 */
export class TextExtractViaDocument extends BaseTextExtract {
  /**
   * Fetch the full DOM tree through `this.cdp` and return its text.
   *
   * @returns {Promise<string>} Newline-joined text of the document.
   */
  async doGetText() {
    const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
    // The parsing method is spelled parseTextFromDom (lower-case "om").
    return this.parseTextFromDom(result);
  }

  /**
   * Extract text from a `DOM.getDocument`-style result.
   *
   * @param {{root: object}} dom - Object whose `root` is the top node of the
   *   tree to walk.
   * @returns {Promise<string>} Extracted text fragments joined with "\n".
   */
  async parseTextFromDom(dom) {
    const accum = [];
    const metadata = {};

    await this.parseText(dom.root, metadata, accum);

    return accum.join("\n");
  }

  /**
   * Recursively collect text from `node` into `accum`.
   *
   * - Nodes named in SKIPPED_NODES (compared lower-cased) are ignored together
   *   with their subtree.
   * - "#text" nodes contribute their trimmed, non-empty `nodeValue`.
   * - "title" nodes have their text joined with spaces and stored in
   *   `metadata.title` when `metadata` is given, otherwise pushed to `accum`.
   * - Any other node has its `children` and then its `contentDocument`
   *   (if present) visited; content documents are walked without metadata.
   *
   * @param {object} node - Node with `nodeName` and optional `nodeValue`,
   *   `children` and `contentDocument`.
   * @param {(object|null)} metadata - Receives `title`; may be null.
   * @param {string[]} accum - Output list, mutated in place.
   * @returns {Promise<void>}
   */
  async parseText(node, metadata, accum) {
    const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
    const EMPTY_LIST = [];
    const TEXT = "#text";
    const TITLE = "title";

    const name = node.nodeName.toLowerCase();

    if (SKIPPED_NODES.includes(name)) {
      return;
    }

    const children = node.children || EMPTY_LIST;

    if (name === TEXT) {
      const value = node.nodeValue ? node.nodeValue.trim() : "";
      if (value) {
        accum.push(value);
      }
      return;
    }

    if (name === TITLE) {
      const title = [];

      for (const child of children) {
        await this.parseText(child, null, title);
      }

      if (metadata) {
        metadata.title = title.join(" ");
      } else {
        accum.push(title.join(" "));
      }
      return;
    }

    // Await each recursive call so errors propagate to the caller instead of
    // becoming unhandled rejections; sequential awaits keep document order.
    for (const child of children) {
      await this.parseText(child, metadata, accum);
    }

    if (node.contentDocument) {
      await this.parseText(node.contentDocument, null, accum);
    }
  }
}
|
|
|