import fs from "fs";
import path from "path";

import * as warcio from "warcio";

import { logger } from "./logger.js";


// ============================================================================
// Screenshot presets, keyed by the type name passed to Screenshots.take()
export const screenshotTypes = {
  "view": {
    type: "png",
    omitBackground: true,
    fullPage: false
  },
  "thumbnail": {
    type: "jpeg",
    omitBackground: true,
    fullPage: false,
    quality: 75
  },
  "fullPage": {
    type: "png",
    omitBackground: true,
    fullPage: true
  }
};


// Takes screenshots of a page and appends them to a screenshots.warc.gz
// file in the given directory as WARC resource records
export class Screenshots {

  constructor({browser, page, url, date, directory}) {
    this.browser = browser;
    this.page = page;
    this.url = url;
    this.directory = directory;
    this.warcName = path.join(this.directory, "screenshots.warc.gz");
    this.date = date ? date : new Date();
  }

  // Capture a screenshot using the named preset, wrap it in a WARC resource
  // record, and append it to the screenshots WARC
  async take(screenshotType="view") {
    try {
      if (screenshotType !== "fullPage") {
        await this.browser.setViewport(this.page, {width: 1920, height: 1080});
      }
      const options = screenshotTypes[screenshotType];
      const screenshotBuffer = await this.page.screenshot(options);
      const warcRecord = await this.wrap(screenshotBuffer, screenshotType, options.type);
      const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
      fs.appendFileSync(this.warcName, warcRecordBuffer);
      logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
    } catch (e) {
      logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
    }
  }

  async takeFullPage() {
    await this.take("fullPage");
  }

  async takeThumbnail() {
    await this.take("thumbnail");
  }

  // Wrap a screenshot buffer in a WARC resource record addressed as
  // urn:<screenshotType>:<page url>
  async wrap(buffer, screenshotType="screenshot", imageType="png") {
    const warcVersion = "WARC/1.1";
    const warcRecordType = "resource";
    const warcHeaders = {"Content-Type": `image/${imageType}`};

    async function* content() {
      yield buffer;
    }

    const screenshotUrl = `urn:${screenshotType}:` + this.url;

    return warcio.WARCRecord.create({
      url: screenshotUrl,
      date: this.date.toISOString(),
      type: warcRecordType,
      warcVersion,
      warcHeaders
    }, content());
  }
}
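
// ============================================================================
// Illustrative usage sketch (not part of this module): how a caller with an
// open browser and page might drive the Screenshots class. The `browser`,
// `page`, and `collDir` names below are hypothetical stand-ins supplied by the
// caller; only the Screenshots API defined above is taken from this file.
//
//   const screenshots = new Screenshots({
//     browser,
//     page,
//     url: page.url(),
//     directory: collDir
//   });
//
//   await screenshots.take();          // "view": 1920x1080 viewport PNG
//   await screenshots.takeThumbnail(); // JPEG thumbnail, quality 75
//   await screenshots.takeFullPage();  // full-page PNG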
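
// ============================================================================
// Illustrative sketch: reading the records back out of screenshots.warc.gz
// with warcio's WARCParser, e.g. to check what Screenshots.wrap() wrote. This
// is an assumption about how the output could be inspected, not part of the
// crawler itself; `warcPath` is a hypothetical path to the generated WARC, and
// the warcio.js docs are authoritative for the parsing API.
//
//   import { WARCParser } from "warcio";
//
//   async function listScreenshotRecords(warcPath) {
//     const parser = new WARCParser(fs.createReadStream(warcPath));
//     for await (const record of parser) {
//       // each record is a "resource" record with a urn:<type>:<url> target URI
//       console.log(record.warcType, record.warcTargetURI);
//     }
//   }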