browsertrix-crawler/util/screenshots.js

import fs from "fs";
import path from "path";
import * as warcio from "warcio";

import { logger } from "./logger.js";

// ============================================================================

/**
 * Screenshot presets, keyed by screenshot type name.
 *
 * Each value is passed unchanged as the options object to `page.screenshot()`:
 *  - "view":      PNG of the currently visible viewport
 *  - "thumbnail": JPEG (quality 75) of the currently visible viewport
 *  - "fullPage":  PNG of the entire scrollable page
 */
export const screenshotTypes = {
  "view": {
    type: "png",
    omitBackground: true,
    fullPage: false
  },
  "thumbnail": {
    type: "jpeg",
    omitBackground: true,
    fullPage: false,
    quality: 75
  },
  "fullPage": {
    type: "png",
    omitBackground: true,
    // Must be true: with false this preset is identical to "view" and only
    // the visible viewport is captured.
    fullPage: true
  }
};


/**
 * Takes screenshots of a single page and appends each one, wrapped in a
 * gzipped WARC "resource" record, to `screenshots.warc.gz` inside `directory`.
 */
export class Screenshots {

  /**
   * @param {object} opts
   * @param {object} opts.page - page object; `setViewportSize()` and `screenshot()` are called on it
   * @param {string} opts.url - URL of the page; used in the record URN and in log messages
   * @param {Date} [opts.date] - timestamp stored in each WARC record; defaults to the current time
   * @param {string} opts.directory - directory in which `screenshots.warc.gz` is written
   */
  constructor({page, url, date, directory}) {
    this.page = page;
    this.url = url;
    this.directory = directory;
    this.warcName = path.join(this.directory, "screenshots.warc.gz");
    this.date = date ? date : new Date();
  }

  /**
   * Take one screenshot of the given type and append it to the WARC file.
   * Failures (including an unknown type) are caught and reported through
   * `logger.error` rather than thrown.
   *
   * @param {string} [screenshotType="view"] - a key of `screenshotTypes`
   * @returns {Promise<void>}
   */
  async take(screenshotType="view") {
    try {
      // Validate before touching the page: otherwise an unknown type would
      // still resize the viewport and take a default screenshot, then fail
      // with an opaque TypeError when reading `options.type`.
      if (!Object.prototype.hasOwnProperty.call(screenshotTypes, screenshotType)) {
        throw new Error(`Unknown screenshot type: ${screenshotType}`);
      }
      const options = screenshotTypes[screenshotType];
      await this.page.setViewportSize({width: 1920, height: 1080});
      const screenshotBuffer = await this.page.screenshot(options);
      const warcRecord = await this.wrap(screenshotBuffer, screenshotType, options.type);
      const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
      // Each record is gzipped on its own and appended to the shared file.
      fs.appendFileSync(this.warcName, warcRecordBuffer);
      logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
    } catch (e) {
      logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
    }
  }

  /**
   * Shortcut for `take("fullPage")`.
   * @returns {Promise<void>}
   */
  async takeFullPage() {
    await this.take("fullPage");
  }

  /**
   * Shortcut for `take("thumbnail")`.
   * @returns {Promise<void>}
   */
  async takeThumbnail() {
    await this.take("thumbnail");
  }

  /**
   * Build a WARC/1.1 "resource" record for an image buffer. The record URL is
   * `urn:<screenshotType>:<page url>` and its date is this instance's date.
   *
   * @param {*} buffer - image payload, yielded as the record's single content chunk
   * @param {string} [screenshotType="screenshot"] - used as the URN namespace
   * @param {string} [imageType="png"] - image subtype for the `Content-Type` header
   * @returns {Promise<*>} whatever `warcio.WARCRecord.create()` returns
   */
  async wrap(buffer, screenshotType="screenshot", imageType="png") {
    const warcVersion = "WARC/1.1";
    const warcRecordType = "resource";
    const warcHeaders = {"Content-Type": `image/${imageType}`};
    // The record content is supplied as an async iterable with one chunk.
    async function* content() {
      yield buffer;
    }
    const screenshotUrl = `urn:${screenshotType}:${this.url}`;
    return warcio.WARCRecord.create({
      url: screenshotUrl,
      date: this.date.toISOString(),
      type: warcRecordType,
      warcVersion,
      warcHeaders}, content());
  }
}
Add screenshot functionality (#188) * Add screenshot and thumbnail functionality Introduces a --screenshot CLI option, which takes a comma-separated list of screenshot types: view,fullPage,thumbnail. In addition, this commit: - Adds '--experimental-global-webcrypto' to ensure webcrypto is available in node - Deprecates newContext, instead always using page context for 1 worker and window context for >1 worker * Separate screenshotTypes into exported const Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Air.local> 2022-12-21 12:06:13 -05:00			`import fs from "fs";`
			`import path from "path";`
			`import * as warcio from "warcio";`

Logger cleanup (#254) * logging: convert logger to a singleton to simplify use * add logger to create-login-profile.js 2023-03-17 14:24:44 -07:00			`import { logger } from "./logger.js";`
Add screenshot functionality (#188) * Add screenshot and thumbnail functionality Introduces a --screenshot CLI option, which takes a comma-separated list of screenshot types: view,fullPage,thumbnail. In addition, this commit: - Adds '--experimental-global-webcrypto' to ensure webcrypto is available in node - Deprecates newContext, instead always using page context for 1 worker and window context for >1 worker * Separate screenshotTypes into exported const Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Air.local> 2022-12-21 12:06:13 -05:00
			`// ============================================================================`

			`export const screenshotTypes = {`
			`"view": {`
			`type: "png",`
			`omitBackground: true,`
			`fullPage: false`
			`},`
			`"thumbnail": {`
			`type: "jpeg",`
			`omitBackground: true,`
			`fullPage: false,`
			`quality: 75`
			`},`
			`"fullPage": {`
			`type: "png",`
			`omitBackground: true,`
			`fullPage: false`
			`}`
			`};`


			`export class Screenshots {`

			`constructor({page, url, date, directory}) {`
			`this.page = page;`
			`this.url = url;`
			`this.directory = directory;`
			`this.warcName = path.join(this.directory, "screenshots.warc.gz");`
			`this.date = date ? date : new Date();`
			`}`

			`async take(screenshotType="view") {`
			`try {`
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253) * Migrate from Puppeteer to Playwright! - use playwright persistent browser context to support profiles - move on-new-page setup actions to worker - fix screencaster, init only one per page object, associate with worker-id - fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage - port additional chromium setup options - create / detach cdp per page for each new page, screencaster just uses existing cdp - fix evaluateWithCLI to call CDP command directly - workers directly during WorkerPool - await not necessary * State / Worker Refactor (#252) * refactoring state: - use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState - remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster - switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150) - override console.error to avoid logging ioredis errors (fixes #244) - add MAX_DEPTH as const for extraHops - fix immediate exit on second interrupt * worker/state refactor: - remove job object from puppeteer-cluster - rename shift() -> nextFromQueue() - condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc... 
- screencaster: don't screencast about:blank pages * more worker queue refactor: - remove p-queue - initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages - add setupPage(), teardownPage() to crawler, called from worker - await runWorkers() promise which runs all workers until completion - remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code) - bump to 0.9.0-beta.1 * use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition) * more fixes for playwright: - fix profile creation - browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout - crawler: various fixes, including for html check - logging: addition logging for screencaster, new window, etc... - remove unused packages --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net> 2023-03-17 12:50:32 -07:00			`await this.page.setViewportSize({width: 1920, height: 1080});`
Add screenshot functionality (#188) * Add screenshot and thumbnail functionality Introduces a --screenshot CLI option, which takes a comma-separated list of screenshot types: view,fullPage,thumbnail. In addition, this commit: - Adds '--experimental-global-webcrypto' to ensure webcrypto is available in node - Deprecates newContext, instead always using page context for 1 worker and window context for >1 worker * Separate screenshotTypes into exported const Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Air.local> 2022-12-21 12:06:13 -05:00			`const options = screenshotTypes[screenshotType];`
			`const screenshotBuffer = await this.page.screenshot(options);`
			`const warcRecord = await this.wrap(screenshotBuffer, screenshotType, options.type);`
			`const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});`
			`fs.appendFileSync(this.warcName, warcRecordBuffer);`
Implement improved json-l logging - Add Logger class with methods for info, error, warn, debug, fatal - Add context, timestamp, and details fields to log entries - Log messages as JSON Lines - Replace puppeteer-cluster stats with custom stats implementation - Log behaviors by default - Amend argParser to reflect logging changes - Capture and log stdout/stderr from awaited child_processes - Modify tests to use webrecorder.net to avoid timeouts 2022-12-15 12:38:41 -05:00			logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
Add screenshot functionality (#188) * Add screenshot and thumbnail functionality Introduces a --screenshot CLI option, which takes a comma-separated list of screenshot types: view,fullPage,thumbnail. In addition, this commit: - Adds '--experimental-global-webcrypto' to ensure webcrypto is available in node - Deprecates newContext, instead always using page context for 1 worker and window context for >1 worker * Separate screenshotTypes into exported const Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Air.local> 2022-12-21 12:06:13 -05:00			`} catch (e) {`
Implement improved json-l logging - Add Logger class with methods for info, error, warn, debug, fatal - Add context, timestamp, and details fields to log entries - Log messages as JSON Lines - Replace puppeteer-cluster stats with custom stats implementation - Log behaviors by default - Amend argParser to reflect logging changes - Capture and log stdout/stderr from awaited child_processes - Modify tests to use webrecorder.net to avoid timeouts 2022-12-15 12:38:41 -05:00			logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
Add screenshot functionality (#188) * Add screenshot and thumbnail functionality Introduces a --screenshot CLI option, which takes a comma-separated list of screenshot types: view,fullPage,thumbnail. In addition, this commit: - Adds '--experimental-global-webcrypto' to ensure webcrypto is available in node - Deprecates newContext, instead always using page context for 1 worker and window context for >1 worker * Separate screenshotTypes into exported const Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Air.local> 2022-12-21 12:06:13 -05:00			`}`
			`}`

			`async takeFullPage() {`
			`await this.take("fullPage");`
			`}`

			`async takeThumbnail() {`
			`await this.take("thumbnail");`
			`}`

			`async wrap(buffer, screenshotType="screenshot", imageType="png") {`
			`const warcVersion = "WARC/1.1";`
			`const warcRecordType = "resource";`
			const warcHeaders = {"Content-Type": `image/${imageType}`};
			`async function* content() {`
			`yield buffer;`
			`}`
			let screenshotUrl = `urn:${screenshotType}:` + this.url;
			`return warcio.WARCRecord.create({`
			`url: screenshotUrl,`
			`date: this.date.toISOString(),`
			`type: warcRecordType,`
			`warcVersion,`
			`warcHeaders}, content());`
			`}`
			`}`