From 089d901b9b17af59dcb2bc58bad2ef55deeecc16 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 22 May 2024 15:47:05 -0700 Subject: [PATCH] Always add warcinfo records to all WARCs (#556) Fixes #553 Includes `warcinfo` records at the beginning of new WARCs, as well as the combined WARC. Makes the warcinfo record also WARC/1.1 to match the rest of the WARC records. --- src/crawler.ts | 27 +++------------------------ src/util/warcwriter.ts | 31 +++++++++++++++++++++++++++++++ tests/warcinfo.test.js | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 70 insertions(+), 25 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index fa23aa49..9399227f 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -16,8 +16,6 @@ import { parseArgs } from "./util/argParser.js"; import yaml from "js-yaml"; -import * as warcio from "warcio"; - import { HealthChecker } from "./util/healthcheck.js"; import { TextExtractViaSnapshot } from "./util/textextract.js"; import { @@ -60,7 +58,7 @@ import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core"; import { Recorder } from "./util/recorder.js"; import { SitemapReader } from "./util/sitemapper.js"; import { ScopedSeed } from "./util/seeds.js"; -import { WARCWriter } from "./util/warcwriter.js"; +import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js"; const HTTPS_AGENT = new HTTPSAgent({ rejectUnauthorized: false, @@ -458,6 +456,7 @@ export class Crawler { logger.setExternalLogStream(this.logFH); this.infoString = await getInfoString(); + setWARCInfo(this.infoString, this.params.warcInfo); logger.info(this.infoString); logger.info("Seeds", this.params.scopedSeeds); @@ -1104,26 +1103,6 @@ self.__bx_behaviors.selectMainBehavior(); return res ? frame : null; } - async createWARCInfo(filename: string) { - const warcVersion = "WARC/1.1"; - const type = "warcinfo"; - - const info = { - software: this.infoString, - format: "WARC File Format 1.1", - }; - - const warcInfo = { ...info, ...this.params.warcInfo }; - const record = await warcio.WARCRecord.createWARCInfo( - { filename, type, warcVersion }, - warcInfo, - ); - const buffer = await warcio.WARCSerializer.serialize(record, { - gzip: true, - }); - return buffer; - } - async checkLimits() { let interrupt = false; @@ -2412,7 +2391,7 @@ self.__bx_behaviors.selectMainBehavior(); generatedCombinedWarcs.push(combinedWarcName); - const warcBuffer = await this.createWARCInfo(combinedWarcName); + const warcBuffer = await createWARCInfo(combinedWarcName); fh.write(warcBuffer); } diff --git a/src/util/warcwriter.ts b/src/util/warcwriter.ts index 107b30ce..2a14552f 100644 --- a/src/util/warcwriter.ts +++ b/src/util/warcwriter.ts @@ -11,6 +11,8 @@ import PQueue from "p-queue"; const DEFAULT_ROLLOVER_SIZE = 1_000_000_000; +let warcInfo = {}; + export type ResourceRecordData = { buffer: Uint8Array; resourceType: string; @@ -117,6 +119,8 @@ export class WARCWriter implements IndexerOffsetLength { ); } + fh.write(await createWARCInfo(this.filename)); + return fh; } @@ -310,6 +314,33 @@ export class WARCWriter implements IndexerOffsetLength { } } +// ================================================================= +export function setWARCInfo( + software: string, + otherParams?: Record, +) { + warcInfo = { + software, + format: "WARC File Format 1.1", + ...otherParams, + }; +} + +// ================================================================= +export async function createWARCInfo(filename: string) { + const warcVersion = "WARC/1.1"; + const type = "warcinfo"; + + const record = await WARCRecord.createWARCInfo( + { filename, type, warcVersion }, + warcInfo, + ); + const buffer = await WARCSerializer.serialize(record, { + gzip: true, + }); + return buffer; +} + // ================================================================= export function streamFinish(fh: Writable) { const p = new Promise((resolve) => { diff --git a/tests/warcinfo.test.js b/tests/warcinfo.test.js index 529a8379..da24c448 100644 --- a/tests/warcinfo.test.js +++ b/tests/warcinfo.test.js @@ -1,8 +1,11 @@ import fs from "fs"; import zlib from "zlib"; +import path from "path"; import child_process from "child_process"; -test("check that the warcinfo file works as expected on the command line", async () => { +test("run crawl", async() => { + let success = false; + try { const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8"); const proc = child_process.execSync( @@ -11,10 +14,42 @@ test("check that the warcinfo file works as expected on the command line", async ); console.log(proc); + success = true; } catch (error) { console.log(error); } + expect(success).toBe(true); +}); + +test("check that the warcinfo for individual WARC is as expected", async () => { + + const warcs = fs.readdirSync("test-crawls/collections/warcinfo/archive/"); + + let filename = ""; + + for (const name of warcs) { + if (name.startsWith("rec-")) { + filename = path.join("test-crawls/collections/warcinfo/archive/", name); + break; + } + } + + const warcData = fs.readFileSync(filename); + + const data = zlib.gunzipSync(warcData); + + const string = data.toString("utf8"); + + expect(string.indexOf("operator: test")).toBeGreaterThan(-1); + expect(string.indexOf("host: hostname")).toBeGreaterThan(-1); + expect( + string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/), + ).not.toEqual(null); + expect(string.indexOf("format: WARC File Format 1.1")).toBeGreaterThan(-1); +}); + +test("check that the warcinfo for combined WARC file is as expected", async () => { const warcData = fs.readFileSync( "test-crawls/collections/warcinfo/warcinfo_0.warc.gz", );