mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00

Major refactoring of Browsertrix Crawler to natively capture network traffic to WARC files via the Chrome Debug Protocol (CDP). Allows for more flexibility and accuracy when dealing with HTTP/2.x sites and avoids a MITM proxy. Addresses #343 Changes include: - Recorder class for capturing CDP network traffic for each page. - Handling requests from service workers via matching active frames, skipping unrelated requests outside the page (from background pages, etc.) - WARC writing support via TS-based warcio.js library. - Generates single WARC file per worker (still need to add size rollover). - Request interception via Fetch.requestPaused - Rule-based rewriting response support (via wabac.js), using Fetch.getResponseBody() / Fetch.fulfillRequest() - Streaming responses via three methods: inline response fetch via Fetch.takeResponseBodyAsStream, async loading via browser network stack with Network.loadNetworkResource() and node-based async fetch via fetch() - Direct async fetch() capture of non-HTML URLs - Waiting for all requests to finish before moving on to the next page, up to the page timeout. - Experimental: generate CDXJ on-the-fly as WARC is being written (not yet in use). - Removed pywb, using cdxj-indexer for --generateCDX option.
111 lines
2.6 KiB
JavaScript
111 lines
2.6 KiB
JavaScript
import fs from "fs";
|
|
import path from "path";
|
|
|
|
import { CDXIndexer } from "warcio";
|
|
import { WARCSerializer } from "warcio/node";
|
|
import { logger, errJSON } from "./logger.js";
|
|
|
|
|
|
// =================================================================
|
|
/**
 * Writes WARC response/request record pairs to a single file inside
 * `archivesDir` and, when `tempCdxDir` is set, writes CDXJ index entries to
 * `<tempCdxDir>/<filename>.cdx` as records are written.
 *
 * The writer passes itself to the indexer in place of a parser; `offset` and
 * `recordLength` are kept up to date for that purpose (presumably the indexer
 * reads them to locate each record -- confirm against the warcio indexer).
 */
export class WARCWriter
{
  /**
   * @param {object} opts
   * @param {string} opts.archivesDir - directory the WARC file is created in
   * @param {string} [opts.tempCdxDir] - directory for the CDXJ file; falsy disables indexing
   * @param {string} opts.filename - WARC file name (also used as the CDX base name)
   * @param {boolean} opts.gzip - passed to the WARC serializer as its `gzip` option
   * @param {object} opts.logDetails - extra fields merged into error log entries
   */
  constructor({archivesDir, tempCdxDir, filename, gzip, logDetails}) {
    this.archivesDir = archivesDir;
    this.tempCdxDir = tempCdxDir;
    this.filename = filename;
    this.gzip = gzip;
    this.logDetails = logDetails;

    // Start offset of the next record and byte length of the last record written.
    this.offset = 0;
    this.recordLength = 0;

    // An indexer only exists when a CDX output directory was requested.
    this.indexer = this.tempCdxDir ? new CDXIndexer({format: "cdxj"}) : null;

    // Output streams are opened lazily by initFH().
    this.fh = null;
    this.cdxFH = null;
  }

  /**
   * Opens the WARC output stream and, if indexing is enabled, the CDX output
   * stream. Streams that are already open are left untouched.
   */
  async initFH() {
    if (!this.fh) {
      this.fh = fs.createWriteStream(path.join(this.archivesDir, this.filename));
    }

    if (!this.cdxFH && this.tempCdxDir) {
      this.cdxFH = fs.createWriteStream(path.join(this.tempCdxDir, this.filename + ".cdx"));
    }
  }

  /**
   * Writes a response record followed by its request record, indexing each
   * one right after it has been written.
   *
   * @param responseRecord - WARC response record
   * @param requestRecord - WARC request record
   * @param [responseSerializer] - pre-built serializer for the response record;
   *   when omitted, one is created with this writer's gzip setting
   */
  async writeRecordPair(responseRecord, requestRecord, responseSerializer = null) {
    const opts = {gzip: this.gzip};

    const serializer = responseSerializer || new WARCSerializer(responseRecord, opts);

    await this.initFH();

    this.recordLength = await this._writeRecord(responseRecord, serializer);
    this._writeCDX(responseRecord);

    const requestSerializer = new WARCSerializer(requestRecord, opts);
    this.recordLength = await this._writeRecord(requestRecord, requestSerializer);
    this._writeCDX(requestRecord);
  }

  /**
   * Streams every chunk produced by `serializer` into the WARC output stream.
   *
   * Synchronous write failures are logged and the remaining chunks are still
   * attempted. When the stream reports that its buffer is full, writing pauses
   * until the stream drains so large records are not held in memory.
   *
   * @param record - record being written; only `warcTargetURI` is read, for logging
   * @param serializer - async iterable yielding chunks that expose `.length`
   * @returns {Promise<number>} sum of the lengths of all chunks produced
   */
  async _writeRecord(record, serializer) {
    let total = 0;
    const url = record.warcTargetURI;

    for await (const chunk of serializer) {
      total += chunk.length;

      let canContinue = true;
      try {
        canContinue = this.fh.write(chunk);
      } catch (e) {
        logger.error("Error writing to WARC, corruption possible", {...errJSON(e), url, ...this.logDetails}, "writer");
      }

      // write() returning false means the internal buffer is over its
      // high-water mark: wait before pulling the next chunk.
      if (!canContinue) {
        await this._waitForDrain();
      }
    }

    return total;
  }

  /**
   * Resolves once the WARC output stream emits "drain", or "close" if the
   * stream is shut down first ("drain" is not emitted for a stream that is
   * ending). Resolves immediately when there is no usable stream.
   *
   * @returns {Promise<void>}
   */
  _waitForDrain() {
    const fh = this.fh;

    if (!fh || fh.destroyed) {
      return Promise.resolve();
    }

    return new Promise((resolve) => {
      const done = () => {
        fh.off("drain", done);
        fh.off("close", done);
        resolve();
      };
      fh.once("drain", done);
      fh.once("close", done);
    });
  }

  /**
   * Feeds `record` to the indexer (if any), writes the resulting CDX entry (if
   * any) to the CDX stream, then advances `offset` past the record.
   *
   * Passing `null` flushes the indexer without advancing `offset`, since no
   * bytes were written to the WARC for that call.
   *
   * @param record - record that was just written, or null to flush the indexer
   */
  _writeCDX(record) {
    if (this.indexer) {
      const cdx = this.indexer.indexRecord(record, this, this.filename);

      if (this.cdxFH && cdx) {
        this.indexer.write(cdx, this.cdxFH);
      }
    }

    if (record) {
      this.offset += this.recordLength;
    }
  }

  /**
   * Ends and releases the WARC stream, then flushes any entry still pending in
   * the indexer and ends and releases the CDX stream.
   */
  async flush() {
    if (this.fh) {
      await streamFinish(this.fh);
      this.fh = null;
    }

    if (this.cdxFH) {
      // A null record makes the indexer emit whatever it is still holding.
      this._writeCDX(null);

      await streamFinish(this.cdxFH);
      this.cdxFH = null;
    }
  }
}
|
|
|
|
// =================================================================
|
|
/**
 * Ends a writable stream and waits for it to finish.
 *
 * @param fh - writable stream to end
 * @returns {Promise<void>} resolves when the stream emits "finish"; rejects
 *   with the stream's error if it emits "error" first, so a failure while
 *   ending reaches the caller instead of leaving the promise pending
 */
export function streamFinish(fh) {
  const finished = new Promise((resolve, reject) => {
    const onFinish = () => {
      fh.off("error", onError);
      resolve();
    };
    const onError = (err) => {
      fh.off("finish", onFinish);
      reject(err);
    };

    // Listeners are attached before end() so neither event can be missed.
    fh.once("finish", onFinish);
    fh.once("error", onError);
  });

  fh.end();

  return finished;
}
|