browsertrix-crawler/util/warcwriter.js
Ilya Kreymer 877d9f5b44
Use new browser-based archiving mechanism instead of pywb proxy (#424)
Major refactoring of Browsertrix Crawler to natively capture network traffic to WARC files
via the Chrome Debug Protocol (CDP). Allows for more flexibility and accuracy when dealing
with HTTP/2.x sites and avoids a MITM proxy. Addresses #343 

Changes include:
- Recorder class for capturing CDP network traffic for each page.
- Handling requests from service workers via matching active frames, skipping unrelated requests outside the page (from background pages, etc..)
- WARC writing support via TS-based warcio.js library.
- Generates single WARC file per worker (still need to add size rollover).
- Request interception via Fetch.requestPaused
- Rule-based rewriting response support (via wabac.js), using Fetch.getResponseBody() / Fetch.fulfillRequest()
- Streaming responses via three methods: inline response fetch via Fetch.takeResponseBodyAsStream, 
async loading via browser network stack with Network.loadNetworkResource() and node-based async fetch
via fetch()
- Direct async fetch() capture of non-HTML URLs
- Awaiting all requests to finish before moving on to the next page, up to the page timeout.
- Experimental: generate CDXJ on-the-fly as WARC is being written (not yet in use).
- removed pywb, using cdxj-indexer for --generateCDX option.
2023-11-07 21:38:50 -08:00

111 lines
2.6 KiB
JavaScript

import fs from "fs";
import path from "path";
import { CDXIndexer } from "warcio";
import { WARCSerializer } from "warcio/node";
import { logger, errJSON } from "./logger.js";
// =================================================================
export class WARCWriter
{
  /**
   * Writes WARC records (and, optionally, CDXJ index entries) produced by a
   * single worker to a WARC file on disk.
   *
   * @param {Object} opts
   * @param {string} opts.archivesDir - directory the WARC file is written to
   * @param {?string} opts.tempCdxDir - directory for the ".cdx" sidecar; if falsy, no CDX index is generated
   * @param {string} opts.filename - WARC filename (also used to name the ".cdx" sidecar)
   * @param {boolean} opts.gzip - whether individual WARC records are gzip-compressed
   * @param {Object} opts.logDetails - extra context merged into error log messages
   */
  constructor({archivesDir, tempCdxDir, filename, gzip, logDetails}) {
    this.archivesDir = archivesDir;
    this.tempCdxDir = tempCdxDir;
    this.filename = filename;
    this.gzip = gzip;
    this.logDetails = logDetails;

    // running byte offset into the WARC file and length of the record most
    // recently written; both are read by the CDX indexer via _writeCDX()
    this.offset = 0;
    this.recordLength = 0;

    this.indexer = this.tempCdxDir ? new CDXIndexer({format: "cdxj"}) : null;

    this.fh = null;
    this.cdxFH = null;
  }

  // Lazily open the WARC write stream (and the CDX stream, if enabled).
  // Safe to call repeatedly; existing streams are reused.
  async initFH() {
    if (!this.fh) {
      this.fh = fs.createWriteStream(path.join(this.archivesDir, this.filename));
    }
    if (!this.cdxFH && this.tempCdxDir) {
      this.cdxFH = fs.createWriteStream(path.join(this.tempCdxDir, this.filename + ".cdx"));
    }
  }

  /**
   * Write a response/request WARC record pair, indexing each record as it is
   * written. A pre-built serializer for the response may be passed in so the
   * caller can reuse one it already created (e.g. for streamed bodies).
   *
   * @param {Object} responseRecord - WARC response record
   * @param {Object} requestRecord - WARC request record
   * @param {?Object} responseSerializer - optional existing WARCSerializer for the response
   */
  async writeRecordPair(responseRecord, requestRecord, responseSerializer = null) {
    const opts = {gzip: this.gzip};

    if (!responseSerializer) {
      responseSerializer = new WARCSerializer(responseRecord, opts);
    }

    await this.initFH();

    this.recordLength = await this._writeRecord(responseRecord, responseSerializer);
    this._writeCDX(responseRecord);

    const requestSerializer = new WARCSerializer(requestRecord, opts);
    this.recordLength = await this._writeRecord(requestRecord, requestSerializer);
    this._writeCDX(requestRecord);
  }

  // Stream serialized record chunks into the WARC file, honoring stream
  // backpressure. Returns the total number of bytes written for this record.
  async _writeRecord(record, serializer) {
    let total = 0;
    const url = record.warcTargetURI;

    for await (const chunk of serializer) {
      total += chunk.length;
      try {
        // fix: respect backpressure — when the stream's internal buffer is
        // full, write() returns false; wait for "drain" instead of letting
        // the process buffer the rest of the record unboundedly in memory
        if (!this.fh.write(chunk)) {
          await new Promise((resolve) => this.fh.once("drain", resolve));
        }
      } catch (e) {
        // NOTE(review): this catch only sees synchronous write() throws
        // (e.g. write-after-end); async I/O errors surface on the stream's
        // "error" event, which is not handled here — confirm desired behavior
        logger.error("Error writing to WARC, corruption possible", {...errJSON(e), url, ...this.logDetails}, "writer");
      }
    }

    return total;
  }

  // Index the just-written record (the indexer reads offset/recordLength from
  // `this`) and advance the running byte offset for the next record.
  _writeCDX(record) {
    if (this.indexer) {
      const cdx = this.indexer.indexRecord(record, this, this.filename);
      // (removed redundant re-check of this.indexer inside this branch)
      if (this.cdxFH && cdx) {
        this.indexer.write(cdx, this.cdxFH);
      }
    }

    this.offset += this.recordLength;
  }

  // Finalize and close the WARC and CDX write streams.
  async flush() {
    if (this.fh) {
      await streamFinish(this.fh);
      this.fh = null;
    }

    if (this.cdxFH) {
      // NOTE(review): a null record appears to flush the indexer's final
      // buffered entry — confirm against the CDXIndexer API
      this._writeCDX(null);
      await streamFinish(this.cdxFH);
      this.cdxFH = null;
    }
  }
}
// =================================================================
/**
 * End the given write stream and resolve once it has fully flushed.
 *
 * Also settles on "error": without that, a failed stream would never emit
 * "finish" and the returned promise — and callers such as
 * WARCWriter.flush() — would hang forever. It resolves (rather than rejects)
 * on error to stay backward-compatible with callers that do not catch.
 *
 * @param {import("fs").WriteStream} fh - stream to end
 * @returns {Promise<void>} resolves when the stream finishes or errors
 */
export function streamFinish(fh) {
  const p = new Promise((resolve) => {
    fh.once("finish", () => resolve());
    // fix: an I/O error prevents "finish" from ever firing; settle anyway
    fh.once("error", () => resolve());
  });
  fh.end();
  return p;
}