mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
- use DOMSnapshot.captureSnapshot instead of older DOM.getDocument to get the snapshot (consistent with ArchiveWeb.page) - should be slightly more performant - keep option to use DOM.getDocument - refactor warc resource writing to separate class, used by text extraction and screenshots - write extracted text to WARC files as 'urn:text:<url>' after page loads, similar to screenshots - also store final text to WARC as 'urn:textFinal:<url>' if it is different - cli options: update `--text` to take one or more comma-separated string options `--text to-warc,to-pages,final-to-warc`. For backwards compatibility, support `--text` and `--text true` to be equivalent to `--text to-pages`. --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
parent
064db52272
commit
2aeda56d40
8 changed files with 225 additions and 84 deletions
30
crawler.js
30
crawler.js
|
@ -12,7 +12,7 @@ import yaml from "js-yaml";
|
|||
import * as warcio from "warcio";
|
||||
|
||||
import { HealthChecker } from "./util/healthcheck.js";
|
||||
import { TextExtract } from "./util/textextract.js";
|
||||
import { TextExtractViaSnapshot } from "./util/textextract.js";
|
||||
import { initStorage, getFileSize, getDirSize, interpolateFilename, checkDiskUtilization } from "./util/storage.js";
|
||||
import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screencaster.js";
|
||||
import { Screenshots } from "./util/screenshots.js";
|
||||
|
@ -492,11 +492,12 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
data.title = await page.title();
|
||||
data.favicon = await this.getFavicon(page, logDetails);
|
||||
|
||||
const archiveDir = path.join(this.collDir, "archive");
|
||||
|
||||
if (this.params.screenshot) {
|
||||
if (!data.isHTMLPage) {
|
||||
logger.debug("Skipping screenshots for non-HTML page", logDetails);
|
||||
}
|
||||
const archiveDir = path.join(this.collDir, "archive");
|
||||
const screenshots = new Screenshots({browser: this.browser, page, url, directory: archiveDir});
|
||||
if (this.params.screenshot.includes("view")) {
|
||||
await screenshots.take();
|
||||
|
@ -509,9 +510,15 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
}
|
||||
|
||||
if (this.params.text && data.isHTMLPage) {
|
||||
const result = await cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
|
||||
data.text = await new TextExtract(result).parseTextFromDom();
|
||||
let textextract = null;
|
||||
|
||||
if (data.isHTMLPage) {
|
||||
textextract = new TextExtractViaSnapshot(cdp, {url, directory: archiveDir});
|
||||
const {changed, text} = await textextract.extractAndStoreText("text", false, this.params.text.includes("to-warc"));
|
||||
|
||||
if (changed && text && this.params.text.includes("to-pages")) {
|
||||
data.text = text;
|
||||
}
|
||||
}
|
||||
|
||||
data.loadState = LoadState.EXTRACTION_DONE;
|
||||
|
@ -535,6 +542,10 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
if (res) {
|
||||
data.loadState = LoadState.BEHAVIORS_DONE;
|
||||
}
|
||||
|
||||
if (textextract && this.params.text.includes("final-to-warc")) {
|
||||
await textextract.extractAndStoreText("textFinal", true, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1420,12 +1431,11 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
if (createNew) {
|
||||
const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
|
||||
if (this.params.text) {
|
||||
header["hasText"] = true;
|
||||
logger.debug("Text Extraction: Enabled");
|
||||
header["hasText"] = this.params.text.includes("to-pages");
|
||||
if (this.params.text.length) {
|
||||
logger.debug("Text Extraction: " + this.params.text.join(","));
|
||||
} else {
|
||||
header["hasText"] = false;
|
||||
logger.debug("Text Extraction: Disabled");
|
||||
logger.debug("Text Extraction: None");
|
||||
}
|
||||
const header_formatted = JSON.stringify(header).concat("\n");
|
||||
await this.pagesFH.writeFile(header_formatted);
|
||||
|
|
18
tests/text-extract.test.js
Normal file
18
tests/text-extract.test.js
Normal file
|
@ -0,0 +1,18 @@
|
|||
import fs from "fs";
|
||||
import child_process from "child_process";
|
||||
|
||||
// Integration test: runs a single-page crawl in docker with
// `--text to-warc,final-to-warc` and checks that the generated CDXJ index
// lists both the `urn:text:` and `urn:textFinal:` records for the page.
test("check that urn:text and urn:textfinal records are written to WARC", async () => {
  try {
    child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc");
  } catch (error) {
    // Best effort: surface the crawler's stderr; the assertions below decide pass/fail.
    console.log(error.stderr);
  }

  const data = fs.readFileSync("test-crawls/collections/text-extract/indexes/index.cdxj", {"encoding": "utf-8"});

  // `includes` rather than `indexOf(...) > 0`: a match at offset 0 is still a match.
  expect(data.includes("urn:text:https://www.nytimes.com/")).toBe(true);

  expect(data.includes("urn:textFinal:https://www.nytimes.com/")).toBe(true);
});
|
||||
|
|
@ -7,7 +7,7 @@ import { KnownDevices as devices } from "puppeteer-core";
|
|||
import yargs from "yargs";
|
||||
import { hideBin } from "yargs/helpers";
|
||||
|
||||
import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } from "./constants.js";
|
||||
import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js";
|
||||
import { ScopedSeed } from "./seeds.js";
|
||||
import { interpolateFilename } from "./storage.js";
|
||||
import { screenshotTypes } from "./screenshots.js";
|
||||
|
@ -45,12 +45,6 @@ class ArgParser {
|
|||
default: process.env.CRAWL_ID || os.hostname(),
|
||||
},
|
||||
|
||||
"newContext": {
|
||||
describe: "Deprecated as of 0.8.0, any values passed will be ignored",
|
||||
default: null,
|
||||
type: "string"
|
||||
},
|
||||
|
||||
"waitUntil": {
|
||||
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
|
||||
default: "load,networkidle2",
|
||||
|
@ -197,8 +191,7 @@ class ArgParser {
|
|||
|
||||
"text": {
|
||||
describe: "If set, extract text to the pages.jsonl file",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"cwd": {
|
||||
|
@ -458,28 +451,20 @@ class ArgParser {
|
|||
// waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
|
||||
// can be multiple separate by comma
|
||||
// (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
|
||||
if (typeof argv.waitUntil != "object"){
|
||||
if (typeof argv.waitUntil != "object") {
|
||||
argv.waitUntil = argv.waitUntil.split(",");
|
||||
}
|
||||
|
||||
for (const opt of argv.waitUntil) {
|
||||
if (!WAIT_UNTIL_OPTS.includes(opt)) {
|
||||
logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
|
||||
}
|
||||
// split text options
|
||||
if (argv.text === "" || argv.text === "true") {
|
||||
argv.text = "to-pages";
|
||||
}
|
||||
|
||||
// validate screenshot options
|
||||
if (argv.screenshot) {
|
||||
const passedScreenshotTypes = argv.screenshot.split(",");
|
||||
argv.screenshot = [];
|
||||
passedScreenshotTypes.forEach((element) => {
|
||||
if (element in screenshotTypes) {
|
||||
argv.screenshot.push(element);
|
||||
} else {
|
||||
logger.warn(`${element} not found in ${screenshotTypes}`);
|
||||
}
|
||||
});
|
||||
}
|
||||
argv.waitUntil = validateArrayOpts(argv.waitUntil, "waitUntil", WAIT_UNTIL_OPTS);
|
||||
|
||||
argv.screenshot = validateArrayOpts(argv.screenshot, "screenshot", Array.from(Object.keys(screenshotTypes)));
|
||||
|
||||
argv.text = validateArrayOpts(argv.text, "text", EXTRACT_TEXT_TYPES);
|
||||
|
||||
// log options
|
||||
argv.logging = argv.logging.split(",");
|
||||
|
@ -575,6 +560,30 @@ class ArgParser {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Normalize a comma-separated CLI option into an array of values.
 *
 * @param {*} value - raw option value; an array is returned as-is (unvalidated),
 *   a non-empty string is split on ",", anything else yields [].
 * @param {string} name - option name, used only in the error message.
 * @param {string[]} allowedValues - values each split entry must match.
 * @returns {Array} the resulting list of values.
 *
 * logger.fatal() is called once for every split entry not in allowedValues.
 */
function validateArrayOpts(value, name, allowedValues) {
  if (value instanceof Array) {
    return value;
  }

  // empty string, null/undefined and any other non-string input mean "no values"
  if (typeof value !== "string" || !value) {
    return [];
  }

  const entries = value.split(",");

  entries.forEach((entry) => {
    if (allowedValues.includes(entry)) {
      return;
    }
    logger.fatal(`Invalid value "${entry}" for field "${name}": allowed values are: ${allowedValues.join(",")}`);
  });

  return entries;
}
|
||||
|
||||
export function parseArgs(argv) {
|
||||
return new ArgParser().parseArgs(argv);
|
||||
}
|
||||
|
|
|
@ -361,4 +361,5 @@ export const defaultArgs = [
|
|||
"--no-service-autorun",
|
||||
"--export-tagged-pdf",
|
||||
"--component-updater=url-source=http://invalid.dev/",
|
||||
"--brave-stats-updater-server=url-source=http://invalid.dev/"
|
||||
];
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
|
||||
export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
|
||||
export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
|
||||
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
|
||||
|
||||
export const BEHAVIOR_LOG_FUNC = "__bx_log";
|
||||
export const ADD_LINK_FUNC = "__bx_addLink";
|
||||
export const MAX_DEPTH = 1000000;
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
import fs from "fs";
|
||||
import path from "path";
|
||||
import * as warcio from "warcio";
|
||||
import sharp from "sharp";
|
||||
|
||||
import { WARCResourceWriter } from "./warcresourcewriter.js";
|
||||
import { logger, errJSON } from "./logger.js";
|
||||
|
||||
|
||||
// ============================================================================
|
||||
|
||||
export const screenshotTypes = {
|
||||
|
@ -26,15 +25,12 @@ export const screenshotTypes = {
|
|||
};
|
||||
|
||||
|
||||
export class Screenshots {
|
||||
export class Screenshots extends WARCResourceWriter {
|
||||
|
||||
constructor({browser, page, url, date, directory}) {
|
||||
this.browser = browser;
|
||||
this.page = page;
|
||||
this.url = url;
|
||||
this.directory = directory;
|
||||
this.warcName = path.join(this.directory, "screenshots.warc.gz");
|
||||
this.date = date ? date : new Date();
|
||||
constructor(opts) {
|
||||
super({...opts, warcName: "screenshots.warc.gz"});
|
||||
this.browser = opts.browser;
|
||||
this.page = opts.page;
|
||||
}
|
||||
|
||||
async take(screenshotType="view") {
|
||||
|
@ -44,7 +40,7 @@ export class Screenshots {
|
|||
}
|
||||
const options = screenshotTypes[screenshotType];
|
||||
const screenshotBuffer = await this.page.screenshot(options);
|
||||
await this.writeBufferToWARC(screenshotBuffer, screenshotType, options.type);
|
||||
await this.writeBufferToWARC(screenshotBuffer, screenshotType, "image/" + options.type);
|
||||
logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
|
||||
} catch (e) {
|
||||
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
|
||||
|
@ -65,32 +61,10 @@ export class Screenshots {
|
|||
// 16:9 thumbnail
|
||||
.resize(640, 360)
|
||||
.toBuffer();
|
||||
await this.writeBufferToWARC(thumbnailBuffer, screenshotType, options.type);
|
||||
await this.writeBufferToWARC(thumbnailBuffer, screenshotType, "image/" + options.type);
|
||||
logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
|
||||
} catch (e) {
|
||||
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
|
||||
}
|
||||
}
|
||||
|
||||
async writeBufferToWARC(screenshotBuffer, screenshotType, imageType) {
|
||||
const warcRecord = await this.wrap(screenshotBuffer, screenshotType, imageType);
|
||||
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
|
||||
fs.appendFileSync(this.warcName, warcRecordBuffer);
|
||||
}
|
||||
|
||||
async wrap(buffer, screenshotType="screenshot", imageType="png") {
|
||||
const warcVersion = "WARC/1.1";
|
||||
const warcRecordType = "resource";
|
||||
const warcHeaders = {"Content-Type": `image/${imageType}`};
|
||||
async function* content() {
|
||||
yield buffer;
|
||||
}
|
||||
let screenshotUrl = `urn:${screenshotType}:` + this.url;
|
||||
return warcio.WARCRecord.create({
|
||||
url: screenshotUrl,
|
||||
date: this.date.toISOString(),
|
||||
type: warcRecordType,
|
||||
warcVersion,
|
||||
warcHeaders}, content());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,106 @@
|
|||
export class TextExtract {
|
||||
import { WARCResourceWriter } from "./warcresourcewriter.js";
|
||||
import { logger } from "./logger.js";
|
||||
|
||||
constructor(dom){
|
||||
this.dom = dom;
|
||||
|
||||
// ============================================================================
|
||||
/**
 * Base class for page text extraction. Subclasses implement doGetText();
 * this class handles de-duplication against the previously extracted text
 * and optional writing of the text to "text.warc.gz" via WARCResourceWriter.
 */
export class BaseTextExtract extends WARCResourceWriter {
  /**
   * @param {object} cdp - CDP session, stored on the instance for subclasses.
   * @param {object} opts - options forwarded to WARCResourceWriter; `warcName`
   *   is always overridden with "text.warc.gz".
   */
  constructor(cdp, opts) {
    super({...opts, warcName: "text.warc.gz"});
    this.cdp = cdp;
    // text returned by the most recent successful extraction, or null if none yet
    this.lastText = null;
  }

  /**
   * Extract the page text and optionally store it as a WARC resource record.
   *
   * @param {string} resourceType - used in the record URL `urn:<resourceType>:<url>`.
   * @param {boolean} [ignoreIfMatchesLast=false] - if true and the text equals the
   *   previously extracted text, skip writing and report `changed: false`.
   * @param {boolean} [saveToWarc=false] - if true, write the text to the WARC file.
   * @returns {Promise<{changed: boolean, text: (string|null)}>} `text` is null
   *   only when extraction or writing threw; errors are logged, never rethrown.
   */
  async extractAndStoreText(resourceType, ignoreIfMatchesLast = false, saveToWarc = false) {
    try {
      const text = await this.doGetText();

      if (ignoreIfMatchesLast && text === this.lastText) {
        // lastText already equals text here, so it is deliberately left untouched
        // (previously it was overwritten with the never-assigned `this.text`).
        logger.debug("Skipping, extracted text unchanged from last extraction", {url: this.url}, "text");
        return {changed: false, text};
      }

      if (saveToWarc) {
        await this.writeBufferToWARC(new TextEncoder().encode(text), resourceType, "text/plain");
        logger.debug(`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`);
      }

      this.lastText = text;
      return {changed: true, text};
    } catch (e) {
      logger.debug("Error extracting text", e, "text");
      return {changed: false, text: null};
    }
  }

  /**
   * Produce the current page text. Must be overridden by subclasses.
   * @returns {Promise<string>}
   */
  async doGetText() {
    throw new Error("unimplemented");
  }
}
|
||||
|
||||
|
||||
// ============================================================================
|
||||
/**
 * Text extraction based on the CDP `DOMSnapshot.captureSnapshot` command.
 */
export class TextExtractViaSnapshot extends BaseTextExtract {
  /**
   * Capture a DOM snapshot over CDP and return its visible text.
   * @returns {Promise<string>}
   */
  async doGetText() {
    const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []});
    return this.parseTextFromDOMSnapshot(result);
  }

  /**
   * Collect the trimmed, non-empty text nodes of every document in a snapshot.
   *
   * @param {{strings: string[], documents: object[]}} result - snapshot result;
   *   each document's `nodes` holds the parallel arrays `nodeValue`, `nodeName`,
   *   `nodeType` and `parentIndex`, where values/names index into `strings`.
   * @returns {string} text nodes joined with "\n" ("" when nothing was found).
   */
  parseTextFromDOMSnapshot(result) {
    const TEXT_NODE = 3;
    const ELEMENT_NODE = 1;

    // A text node is dropped when its direct parent element has one of these names.
    const SKIPPED_NODES = ["SCRIPT", "STYLE", "HEADER", "FOOTER", "BANNER-DIV", "NOSCRIPT", "TITLE"];

    const {strings, documents} = result;

    const accum = [];

    for (const doc of documents) {
      const nodeValues = doc.nodes.nodeValue;
      const nodeNames = doc.nodes.nodeName;
      const nodeTypes = doc.nodes.nodeType;
      const parentIndex = doc.nodes.parentIndex;

      for (let i = 0; i < nodeValues.length; i++) {
        // -1 means the node has no value string
        if (nodeValues[i] === -1) {
          continue;
        }

        if (nodeTypes[i] !== TEXT_NODE) {
          continue;
        }

        const pi = parentIndex[i];
        if (pi < 0 || nodeTypes[pi] !== ELEMENT_NODE) {
          continue;
        }

        const name = strings[nodeNames[pi]];
        if (SKIPPED_NODES.includes(name)) {
          continue;
        }

        const value = strings[nodeValues[i]].trim();
        if (value) {
          accum.push(value);
        }
      }
    }

    // Join only after ALL documents were visited; returning inside the loop
    // dropped every document after the first and returned undefined for none.
    return accum.join("\n");
  }
}
|
||||
|
||||
|
||||
// ============================================================================
|
||||
export class TextExtractViaDocument extends BaseTextExtract {
|
||||
async doGetText() {
|
||||
const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
|
||||
return this.parseTextFromDOM(result);
|
||||
}
|
||||
|
||||
async parseTextFromDom(dom) {
|
||||
const accum = [];
|
||||
const metadata = {};
|
||||
|
||||
this.parseText(dom.root, metadata, accum);
|
||||
|
||||
return accum.join("\n");
|
||||
}
|
||||
|
||||
async parseText(node, metadata, accum) {
|
||||
|
@ -45,14 +144,5 @@ export class TextExtract {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
async parseTextFromDom() {
|
||||
const accum = [];
|
||||
const metadata = {};
|
||||
|
||||
this.parseText(this.dom.root, metadata, accum);
|
||||
|
||||
return accum.join("\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
37
util/warcresourcewriter.js
Normal file
37
util/warcresourcewriter.js
Normal file
|
@ -0,0 +1,37 @@
|
|||
import fs from "fs";
|
||||
import path from "path";
|
||||
import * as warcio from "warcio";
|
||||
|
||||
/**
 * Writes arbitrary buffers as gzip-compressed WARC "resource" records,
 * appended to a single WARC file inside `directory`.
 */
export class WARCResourceWriter {
  /**
   * @param {object} opts
   * @param {string} opts.url - page URL the resources belong to.
   * @param {string} opts.directory - directory containing the WARC file.
   * @param {Date} [opts.date] - record date; defaults to the construction time.
   * @param {string} opts.warcName - WARC file name, joined onto `directory`.
   */
  constructor({url, directory, date, warcName}) {
    this.url = url;
    this.directory = directory;
    this.warcName = path.join(this.directory, warcName);
    this.date = date || new Date();
  }

  /**
   * Wrap `contents` in a WARC record, serialize it gzipped and append it
   * (synchronously) to the WARC file.
   */
  async writeBufferToWARC(contents, resourceType, contentType) {
    const record = await this.wrap(contents, resourceType, contentType);
    const serialized = await warcio.WARCSerializer.serialize(record, {gzip: true});
    fs.appendFileSync(this.warcName, serialized);
  }

  /**
   * Build a WARC/1.1 "resource" record for `buffer`, addressed as
   * `urn:<resourceType>:<url>` with the given Content-Type.
   */
  async wrap(buffer, resourceType, contentType) {
    // the record payload is supplied as an async iterator yielding the single buffer
    const payload = (async function* () {
      yield buffer;
    })();

    const recordOpts = {
      url: `urn:${resourceType}:${this.url}`,
      date: this.date.toISOString(),
      type: "resource",
      warcVersion: "WARC/1.1",
      warcHeaders: {"Content-Type": contentType},
    };

    return warcio.WARCRecord.create(recordOpts, payload);
  }
}
|
Loading…
Add table
Add a link
Reference in a new issue