2022-12-21 12:06:13 -05:00
|
|
|
import fs from "fs";
|
|
|
|
import path from "path";
|
|
|
|
import * as warcio from "warcio";
|
|
|
|
|
2023-03-17 14:24:44 -07:00
|
|
|
import { logger } from "./logger.js";
|
2022-12-21 12:06:13 -05:00
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
/**
 * Screenshot presets, keyed by screenshot type name.
 *
 * Each value is passed unchanged as the options argument of
 * `page.screenshot()`; its `type` field is also used to build the
 * `image/<type>` Content-Type of the resulting WARC record.
 *
 * - `view`:      PNG of the current viewport only.
 * - `thumbnail`: JPEG (quality 75) of the current viewport only.
 * - `fullPage`:  PNG of the entire scrollable page.
 */
export const screenshotTypes = {
  "view": {
    type: "png",
    omitBackground: true,
    fullPage: false
  },
  "thumbnail": {
    type: "jpeg",
    omitBackground: true,
    fullPage: false,
    quality: 75
  },
  "fullPage": {
    type: "png",
    omitBackground: true,
    // Must be true: otherwise this preset is identical to "view" and only
    // the visible viewport is captured.
    fullPage: true
  }
};
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Takes screenshots of a browser page and appends each one, wrapped in a
 * WARC `resource` record, to `screenshots.warc.gz` inside `directory`.
 */
export class Screenshots {

  /**
   * @param {object} opts
   * @param {object} opts.page - Page object; must provide async
   *   `setViewportSize()` and `screenshot()` methods.
   * @param {string} opts.url - URL of the page being captured; used in log
   *   messages and in the WARC record's `urn:` target URI.
   * @param {Date} [opts.date] - Timestamp written to the WARC records
   *   (`toISOString()` is called on it). Defaults to the construction time.
   * @param {string} opts.directory - Directory containing the output WARC.
   */
  constructor({page, url, date, directory}) {
    this.page = page;
    this.url = url;
    this.directory = directory;
    this.warcName = path.join(this.directory, "screenshots.warc.gz");
    this.date = date ? date : new Date();
  }

  /**
   * Take one screenshot and append it to the WARC file.
   *
   * Failures (including an unknown `screenshotType`) are logged via
   * `logger.error` and are NOT rethrown, so a failed screenshot never
   * aborts the caller.
   *
   * @param {string} [screenshotType="view"] - Key of `screenshotTypes`.
   * @returns {Promise<void>}
   */
  async take(screenshotType="view") {
    try {
      // Validate before touching the page: an unknown type would otherwise
      // still trigger a (wasted) default screenshot and then fail with an
      // opaque "cannot read properties of undefined" error.
      if (!Object.prototype.hasOwnProperty.call(screenshotTypes, screenshotType)) {
        throw new Error(`Unknown screenshot type: ${screenshotType}`);
      }
      const options = screenshotTypes[screenshotType];

      await this.page.setViewportSize({width: 1920, height: 1080});
      const screenshotBuffer = await this.page.screenshot(options);
      const warcRecord = await this.wrap(screenshotBuffer, screenshotType, options.type);
      const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
      // Synchronous append: the whole record is written in one call.
      fs.appendFileSync(this.warcName, warcRecordBuffer);
      logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
    } catch (e) {
      logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
    }
  }

  /**
   * Take a screenshot using the "fullPage" preset.
   * @returns {Promise<void>}
   */
  async takeFullPage() {
    await this.take("fullPage");
  }

  /**
   * Take a screenshot using the "thumbnail" preset.
   * @returns {Promise<void>}
   */
  async takeThumbnail() {
    await this.take("thumbnail");
  }

  /**
   * Wrap raw image bytes in a WARC/1.1 `resource` record whose target URI
   * is `urn:<screenshotType>:<page url>`.
   *
   * @param {Buffer|Uint8Array} buffer - Encoded image bytes.
   * @param {string} [screenshotType="screenshot"] - Used in the `urn:` URI.
   * @param {string} [imageType="png"] - Image subtype for the Content-Type.
   * @returns {Promise<*>} Whatever `warcio.WARCRecord.create()` returns.
   */
  async wrap(buffer, screenshotType="screenshot", imageType="png") {
    const warcVersion = "WARC/1.1";
    const warcRecordType = "resource";
    const warcHeaders = {"Content-Type": `image/${imageType}`};

    // The record payload is supplied as an async iterator yielding the
    // image bytes as a single chunk.
    async function* content() {
      yield buffer;
    }

    const screenshotUrl = `urn:${screenshotType}:` + this.url;

    return warcio.WARCRecord.create({
      url: screenshotUrl,
      date: this.date.toISOString(),
      type: warcRecordType,
      warcVersion,
      warcHeaders}, content());
  }
}
|