improved text extraction: (addresses #403) (#404)

- use DOMSnapshot.captureSnapshot instead of older DOM.getDocument to
get the snapshot (consistent with ArchiveWeb.page) - should be slightly
more performant
- keep option to use DOM.getDocument
- refactor warc resource writing to separate class, used by text
extraction and screenshots
- write extracted text to WARC files as 'urn:text:<url>' after page
loads, similar to screenshots
- also store final text to WARC as 'urn:textFinal:<url>' if it is
different
- cli options: update `--text` to take one or more comma-separated
string options `--text to-warc,to-pages,final-to-warc`. For backwards
compatibility, support `--text` and `--text true` to be equivalent to
`--text to-pages`.

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
Ilya Kreymer 2023-10-31 23:05:30 -07:00 committed by GitHub
parent 064db52272
commit 2aeda56d40
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 225 additions and 84 deletions

View file

@ -12,7 +12,7 @@ import yaml from "js-yaml";
import * as warcio from "warcio";
import { HealthChecker } from "./util/healthcheck.js";
import { TextExtract } from "./util/textextract.js";
import { TextExtractViaSnapshot } from "./util/textextract.js";
import { initStorage, getFileSize, getDirSize, interpolateFilename, checkDiskUtilization } from "./util/storage.js";
import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
@ -492,11 +492,12 @@ self.__bx_behaviors.selectMainBehavior();
data.title = await page.title();
data.favicon = await this.getFavicon(page, logDetails);
const archiveDir = path.join(this.collDir, "archive");
if (this.params.screenshot) {
if (!data.isHTMLPage) {
logger.debug("Skipping screenshots for non-HTML page", logDetails);
}
const archiveDir = path.join(this.collDir, "archive");
const screenshots = new Screenshots({browser: this.browser, page, url, directory: archiveDir});
if (this.params.screenshot.includes("view")) {
await screenshots.take();
@ -509,9 +510,15 @@ self.__bx_behaviors.selectMainBehavior();
}
}
if (this.params.text && data.isHTMLPage) {
const result = await cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
data.text = await new TextExtract(result).parseTextFromDom();
let textextract = null;
if (data.isHTMLPage) {
textextract = new TextExtractViaSnapshot(cdp, {url, directory: archiveDir});
const {changed, text} = await textextract.extractAndStoreText("text", false, this.params.text.includes("to-warc"));
if (changed && text && this.params.text.includes("to-pages")) {
data.text = text;
}
}
data.loadState = LoadState.EXTRACTION_DONE;
@ -535,6 +542,10 @@ self.__bx_behaviors.selectMainBehavior();
if (res) {
data.loadState = LoadState.BEHAVIORS_DONE;
}
if (textextract && this.params.text.includes("final-to-warc")) {
await textextract.extractAndStoreText("textFinal", true, true);
}
}
}
@ -1420,12 +1431,11 @@ self.__bx_behaviors.selectMainBehavior();
if (createNew) {
const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
if (this.params.text) {
header["hasText"] = true;
logger.debug("Text Extraction: Enabled");
header["hasText"] = this.params.text.includes("to-pages");
if (this.params.text.length) {
logger.debug("Text Extraction: " + this.params.text.join(","));
} else {
header["hasText"] = false;
logger.debug("Text Extraction: Disabled");
logger.debug("Text Extraction: None");
}
const header_formatted = JSON.stringify(header).concat("\n");
await this.pagesFH.writeFile(header_formatted);

View file

@ -0,0 +1,18 @@
import fs from "fs";
import child_process from "child_process";
// Integration test: runs the crawler in docker with `--text to-warc,final-to-warc`
// and checks that the generated CDXJ index lists both the `urn:text:` and the
// `urn:textFinal:` records for the crawled URL.
test("check that urn:text and urn:textfinal records are written to WARC", async () => {
  try {
    child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc");
  } catch (error) {
    // Best-effort: print the crawler's stderr as text (it is a Buffer by default)
    // and let the assertions below report the actual failure.
    console.log(String(error.stderr));
  }

  const data = fs.readFileSync("test-crawls/collections/text-extract/indexes/index.cdxj", {"encoding": "utf-8"});

  // toContain also accepts a match at offset 0, which `indexOf(...) > 0` rejected,
  // and prints the searched string on failure.
  expect(data).toContain("urn:text:https://www.nytimes.com/");
  expect(data).toContain("urn:textFinal:https://www.nytimes.com/");
});

View file

@ -7,7 +7,7 @@ import { KnownDevices as devices } from "puppeteer-core";
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } from "./constants.js";
import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js";
@ -45,12 +45,6 @@ class ArgParser {
default: process.env.CRAWL_ID || os.hostname(),
},
"newContext": {
describe: "Deprecated as of 0.8.0, any values passed will be ignored",
default: null,
type: "string"
},
"waitUntil": {
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
default: "load,networkidle2",
@ -197,8 +191,7 @@ class ArgParser {
"text": {
describe: "If set, extract text to the pages.jsonl file",
type: "boolean",
default: false,
type: "string",
},
"cwd": {
@ -458,28 +451,20 @@ class ArgParser {
// waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
// can be multiple separate by comma
// (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
if (typeof argv.waitUntil != "object"){
if (typeof argv.waitUntil != "object") {
argv.waitUntil = argv.waitUntil.split(",");
}
for (const opt of argv.waitUntil) {
if (!WAIT_UNTIL_OPTS.includes(opt)) {
logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
}
// split text options
if (argv.text === "" || argv.text === "true") {
argv.text = "to-pages";
}
// validate screenshot options
if (argv.screenshot) {
const passedScreenshotTypes = argv.screenshot.split(",");
argv.screenshot = [];
passedScreenshotTypes.forEach((element) => {
if (element in screenshotTypes) {
argv.screenshot.push(element);
} else {
logger.warn(`${element} not found in ${screenshotTypes}`);
}
});
}
argv.waitUntil = validateArrayOpts(argv.waitUntil, "waitUntil", WAIT_UNTIL_OPTS);
argv.screenshot = validateArrayOpts(argv.screenshot, "screenshot", Array.from(Object.keys(screenshotTypes)));
argv.text = validateArrayOpts(argv.text, "text", EXTRACT_TEXT_TYPES);
// log options
argv.logging = argv.logging.split(",");
@ -575,6 +560,30 @@ class ArgParser {
}
}
/**
 * Normalize a multi-valued option into an array and validate each entry.
 *
 * @param {string|string[]|*} value - raw option value: a comma-separated
 *   string, an array of values, or anything else
 * @param {string} name - option name, used only in the error message
 * @param {string[]} allowedValues - values an entry is allowed to take
 * @returns {string[]} the array of entries; `[]` when `value` is falsy or is
 *   neither a string nor an array. An array input is returned as-is (same
 *   reference).
 *
 * Every entry that is not in `allowedValues` is reported via `logger.fatal`.
 * Array inputs are validated as well, so a value that is already an array
 * cannot bypass the check.
 */
function validateArrayOpts(value, name, allowedValues) {
  if (!value) {
    return [];
  }

  let arrayValue;

  if (Array.isArray(value)) {
    arrayValue = value;
  } else if (typeof value === "string") {
    arrayValue = value.split(",");
  } else {
    return [];
  }

  for (const entry of arrayValue) {
    if (!allowedValues.includes(entry)) {
      logger.fatal(`Invalid value "${entry}" for field "${name}": allowed values are: ${allowedValues.join(",")}`);
    }
  }

  return arrayValue;
}
/**
 * Convenience entry point: builds a new ArgParser and delegates to its
 * parseArgs method.
 *
 * @param {*} argv - passed through unchanged to ArgParser#parseArgs
 * @returns whatever ArgParser#parseArgs returns for `argv`
 */
export function parseArgs(argv) {
  const parser = new ArgParser();
  return parser.parseArgs(argv);
}

View file

@ -361,4 +361,5 @@ export const defaultArgs = [
"--no-service-autorun",
"--export-tagged-pdf",
"--component-updater=url-source=http://invalid.dev/",
"--brave-stats-updater-server=url-source=http://invalid.dev/"
];

View file

@ -1,6 +1,8 @@
// Content-type strings; presumably the MIME types treated as HTML pages — confirm against callers.
export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
// Accepted values for the `waitUntil` option (page load conditions).
export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
// Accepted values for the comma-separated `--text` option.
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
// NOTE(review): looks like the name of a logging function exposed to behaviors — confirm usage.
export const BEHAVIOR_LOG_FUNC = "__bx_log";
// NOTE(review): looks like the name of a link-collecting function exposed to the page — confirm usage.
export const ADD_LINK_FUNC = "__bx_addLink";
// NOTE(review): presumably an upper bound / default for crawl depth — confirm usage.
export const MAX_DEPTH = 1000000;

View file

@ -1,10 +1,9 @@
import fs from "fs";
import path from "path";
import * as warcio from "warcio";
import sharp from "sharp";
import { WARCResourceWriter } from "./warcresourcewriter.js";
import { logger, errJSON } from "./logger.js";
// ============================================================================
export const screenshotTypes = {
@ -26,15 +25,12 @@ export const screenshotTypes = {
};
export class Screenshots {
export class Screenshots extends WARCResourceWriter {
constructor({browser, page, url, date, directory}) {
this.browser = browser;
this.page = page;
this.url = url;
this.directory = directory;
this.warcName = path.join(this.directory, "screenshots.warc.gz");
this.date = date ? date : new Date();
constructor(opts) {
super({...opts, warcName: "screenshots.warc.gz"});
this.browser = opts.browser;
this.page = opts.page;
}
async take(screenshotType="view") {
@ -44,7 +40,7 @@ export class Screenshots {
}
const options = screenshotTypes[screenshotType];
const screenshotBuffer = await this.page.screenshot(options);
await this.writeBufferToWARC(screenshotBuffer, screenshotType, options.type);
await this.writeBufferToWARC(screenshotBuffer, screenshotType, "image/" + options.type);
logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
} catch (e) {
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
@ -65,32 +61,10 @@ export class Screenshots {
// 16:9 thumbnail
.resize(640, 360)
.toBuffer();
await this.writeBufferToWARC(thumbnailBuffer, screenshotType, options.type);
await this.writeBufferToWARC(thumbnailBuffer, screenshotType, "image/" + options.type);
logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
} catch (e) {
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
}
}
async writeBufferToWARC(screenshotBuffer, screenshotType, imageType) {
const warcRecord = await this.wrap(screenshotBuffer, screenshotType, imageType);
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
fs.appendFileSync(this.warcName, warcRecordBuffer);
}
async wrap(buffer, screenshotType="screenshot", imageType="png") {
const warcVersion = "WARC/1.1";
const warcRecordType = "resource";
const warcHeaders = {"Content-Type": `image/${imageType}`};
async function* content() {
yield buffer;
}
let screenshotUrl = `urn:${screenshotType}:` + this.url;
return warcio.WARCRecord.create({
url: screenshotUrl,
date: this.date.toISOString(),
type: warcRecordType,
warcVersion,
warcHeaders}, content());
}
}

View file

@ -1,7 +1,106 @@
export class TextExtract {
import { WARCResourceWriter } from "./warcresourcewriter.js";
import { logger } from "./logger.js";
constructor(dom){
this.dom = dom;
// ============================================================================
/**
 * Base class for page text extraction. Subclasses supply the text through
 * doGetText(); this class optionally writes it to "text.warc.gz" as a WARC
 * resource record and remembers the last extracted text so an unchanged
 * re-extraction can be skipped.
 */
export class BaseTextExtract extends WARCResourceWriter {
  /**
   * @param cdp - CDP session object; stored for subclasses, which call `cdp.send(...)`
   * @param opts - options forwarded to WARCResourceWriter (with `warcName`
   *   forced to "text.warc.gz")
   */
  constructor(cdp, opts) {
    super({...opts, warcName: "text.warc.gz"});
    this.cdp = cdp;
    // text returned by the most recent extraction, or null before the first one
    this.lastText = null;
  }

  /**
   * Extract text via doGetText() and optionally store it in the WARC.
   *
   * @param {string} resourceType - resource type passed to writeBufferToWARC
   *   (callers use e.g. "text" / "textFinal")
   * @param {boolean} [ignoreIfMatchesLast=false] - when true and the text is
   *   identical to the previous extraction, nothing is written
   * @param {boolean} [saveToWarc=false] - when true, write the text to the
   *   WARC as "text/plain"
   * @returns {Promise<{changed: boolean, text: (string|null)}>} `changed` is
   *   false when the extraction was skipped as unchanged or failed; `text` is
   *   null only on failure. Errors are logged and never thrown.
   */
  async extractAndStoreText(resourceType, ignoreIfMatchesLast = false, saveToWarc = false) {
    try {
      const text = await this.doGetText();

      if (ignoreIfMatchesLast && text === this.lastText) {
        // lastText already equals text here, so it is left untouched
        // (previously it was overwritten with the undefined `this.text`).
        logger.debug("Skipping, extracted text unchanged from last extraction", {url: this.url}, "text");
        return {changed: false, text};
      }

      if (saveToWarc) {
        await this.writeBufferToWARC(new TextEncoder().encode(text), resourceType, "text/plain");
        logger.debug(`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`);
      }

      this.lastText = text;
      return {changed: true, text};
    } catch (e) {
      // best-effort: extraction failures are logged, not propagated
      logger.debug("Error extracting text", e, "text");
      return {changed: false, text: null};
    }
  }

  /**
   * Produce the page text. Must be overridden by subclasses.
   * @throws {Error} always, in this base implementation
   */
  async doGetText() {
    throw new Error("unimplemented");
  }
}
// ============================================================================
/**
 * Text extraction based on the CDP "DOMSnapshot.captureSnapshot" command.
 */
export class TextExtractViaSnapshot extends BaseTextExtract {
  /**
   * Capture a DOM snapshot (requesting no computed styles) and turn it into text.
   */
  async doGetText() {
    const snapshot = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []});
    return this.parseTextFromDOMSnapshot(snapshot);
  }

  /**
   * Collect the trimmed, non-empty values of text nodes whose parent is an
   * element not listed in SKIPPED_NODES, joined with newlines.
   *
   * Note: the `return` sits inside the loop over `documents`, so only the
   * first entry of `documents` is processed; when `documents` is empty the
   * method returns undefined.
   *
   * @param result - object with `strings` (string table) and `documents`
   *   (each with `nodes.nodeValue`, `nodes.nodeName`, `nodes.nodeType` and
   *   `nodes.parentIndex` arrays)
   * @returns {string|undefined}
   */
  parseTextFromDOMSnapshot(result) {
    const TEXT_NODE = 3;
    const ELEMENT_NODE = 1;

    const SKIPPED_NODES = ["SCRIPT", "STYLE", "HEADER", "FOOTER", "BANNER-DIV", "NOSCRIPT", "TITLE"];

    const {strings, documents} = result;

    const lines = [];

    for (const {nodes} of documents) {
      const {
        nodeValue: valueIdx,
        nodeName: nameIdx,
        nodeType: types,
        parentIndex: parents,
      } = nodes;

      for (let i = 0; i < valueIdx.length; i++) {
        // entries with a value index of -1 are skipped; only text nodes contribute
        if (valueIdx[i] === -1 || types[i] !== TEXT_NODE) {
          continue;
        }

        // the parent must exist and be an element
        const parent = parents[i];
        if (!(parent >= 0) || types[parent] !== ELEMENT_NODE) {
          continue;
        }

        // node names and values are indexes into the shared string table
        if (SKIPPED_NODES.includes(strings[nameIdx[parent]])) {
          continue;
        }

        const trimmed = strings[valueIdx[i]].trim();
        if (trimmed) {
          lines.push(trimmed);
        }
      }

      return lines.join("\n");
    }
  }
}
// ============================================================================
export class TextExtractViaDocument extends BaseTextExtract {
async doGetText() {
const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
return this.parseTextFromDOM(result);
}
async parseTextFromDom(dom) {
const accum = [];
const metadata = {};
this.parseText(dom.root, metadata, accum);
return accum.join("\n");
}
async parseText(node, metadata, accum) {
@ -45,14 +144,5 @@ export class TextExtract {
}
}
}
async parseTextFromDom() {
const accum = [];
const metadata = {};
this.parseText(this.dom.root, metadata, accum);
return accum.join("\n");
}
}

View file

@ -0,0 +1,37 @@
import fs from "fs";
import path from "path";
import * as warcio from "warcio";
/**
 * Appends WARC "resource" records for a single page URL to one WARC file.
 * Each record is addressed as `urn:<resourceType>:<url>`.
 */
export class WARCResourceWriter {
  /**
   * @param {object} opts
   * @param {string} opts.url - page URL the resources belong to
   * @param {string} opts.directory - directory that holds the WARC file
   * @param {Date} [opts.date] - record date; falls back to the construction time
   * @param {string} opts.warcName - WARC file name, joined onto `directory`
   */
  constructor({url, directory, date, warcName}) {
    this.url = url;
    this.directory = directory;
    this.warcName = path.join(directory, warcName);
    this.date = date || new Date();
  }

  /**
   * Wrap `contents` in a resource record, serialize it gzipped and append it
   * (synchronously) to the WARC file.
   */
  async writeBufferToWARC(contents, resourceType, contentType) {
    const record = await this.wrap(contents, resourceType, contentType);
    const serialized = await warcio.WARCSerializer.serialize(record, {gzip: true});
    fs.appendFileSync(this.warcName, serialized);
  }

  /**
   * Build a WARC/1.1 "resource" record whose payload is the single `buffer`
   * chunk and whose Content-Type header is `contentType`.
   */
  async wrap(buffer, resourceType, contentType) {
    // the record payload is supplied as an async iterator yielding one chunk
    const payload = async function* () {
      yield buffer;
    };

    const recordOpts = {
      url: `urn:${resourceType}:${this.url}`,
      date: this.date.toISOString(),
      type: "resource",
      warcVersion: "WARC/1.1",
      warcHeaders: {"Content-Type": contentType},
    };

    return warcio.WARCRecord.create(recordOpts, payload());
  }
}