browsertrix-crawler/src/util/wacz.ts
Ilya Kreymer 85a07aff18
Streaming in-place WACZ creation + CDXJ indexing (#673)
Fixes #674 

This PR supersedes #505. Instead of using js-wacz for optimized WACZ
creation, the crawler now:
- generates an 'in-place' or 'streaming' WACZ, without having to copy the
data again.
- streams the WACZ contents to the remote upload (or to disk) directly from
existing files on disk.
- writes per-WARC CDXJ indices to the 'warc-cdx' directory first, then merges them with the Linux 'sort' command and compresses them to ZipNum when the combined CDXJ reaches ~50 KB (or always if using --generateCDX).
- writes and reads all data in the WARCs only once.
- should yield significant speed / disk-usage improvements: previously, each
WARC was written once, read again (for CDXJ indexing), read again (for adding
to the new WACZ ZIP), written to disk (into the new WACZ ZIP), and read again
(if uploading to a remote endpoint). Now, WARCs are written once along with
their per-WARC CDXJ; only the CDXJ is reread, sorted, and merged on disk; and
all data is read once, either to generate the WACZ on disk or to upload it to
the remote endpoint. A minimal driving sketch is shown below.
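
Below is a minimal sketch of driving this flow with the helpers in
`src/util/wacz.ts`. The `WACZ` class, `mergeCDXJ`, `addDirFiles`,
`generateToFile`, `getHash`, and `getSize` come from this file; the collection
layout (`archive/`, `pages/`, `logs/` under the collection directory), the
output filename, and the import path (assuming the caller lives in `src/`) are
illustrative assumptions, not part of this PR.

```ts
import path from "node:path";

import { WACZ, mergeCDXJ, addDirFiles } from "./util/wacz.js";

// Sketch only: the directory names below are assumptions about the crawler's
// collection layout.
async function writeStreamingWACZ(collDir: string, softwareString: string) {
  const warcCdxDir = path.join(collDir, "warc-cdx"); // per-WARC CDXJ from the crawl
  const indexesDir = path.join(collDir, "indexes"); // merged index.cdxj, or index.cdx.gz + index.idx

  // Merge the per-WARC CDXJ via `sort`; switches to compressed ZipNum
  // automatically once the combined CDXJ reaches the size threshold.
  await mergeCDXJ(warcCdxDir, indexesDir);

  const output = path.join(collDir, "crawl.wacz");

  const wacz = new WACZ(
    {
      input: addDirFiles(path.join(collDir, "archive")), // WARC files
      output,
      pages: path.join(collDir, "pages"),
      warcCdxDir,
      indexesDir,
      logDirectory: path.join(collDir, "logs"),
      softwareString,
    },
    collDir,
  );

  // Each input file is read exactly once while the ZIP is streamed to disk;
  // per-file sha256 digests are collected into datapackage.json on the fly.
  await wacz.generateToFile(output);

  console.log("WACZ sha256:", wacz.getHash(), "bytes:", wacz.getSize());
}
```

For a remote upload, the crawler can instead consume `wacz.generate()` as a
stream rather than writing a local file; `getHash()` and `getSize()` are
populated once the stream has been fully consumed.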

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-08-29 13:21:20 -07:00


import path, { basename } from "node:path";
import fs from "node:fs";
import fsp from "node:fs/promises";
import { Writable, Readable } from "node:stream";
import { pipeline } from "node:stream/promises";
import readline from "node:readline";
import child_process from "node:child_process";
import { createHash, Hash } from "node:crypto";
import { gzip } from "node:zlib";
import { ReadableStream } from "node:stream/web";
import { makeZip, InputWithoutMeta } from "client-zip";
import { logger, formatErr } from "./logger.js";
import { streamFinish } from "./warcwriter.js";
import { getDirSize } from "./storage.js";
const DATAPACKAGE_JSON = "datapackage.json";
const DATAPACKAGE_DIGEST_JSON = "datapackage-digest.json";
const INDEX_CDXJ = "index.cdxj";
const INDEX_IDX = "index.idx";
const INDEX_CDX_GZ = "index.cdx.gz";
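// number of CDXJ lines per compressed ZipNum block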
const LINES_PER_BLOCK = 256;
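// if the combined per-WARC CDXJ is at least this size, compress the merged index to ZipNum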
const ZIP_CDX_MIN_SIZE = 50_000;
// ============================================================================
export type WACZInitOpts = {
  input: string[];
  output: string;
  pages: string;
  warcCdxDir: string;
  indexesDir: string;
  logDirectory: string;
  softwareString: string;
  signingUrl?: string;
  signingToken?: string;
  title?: string;
  description?: string;
};
export type WACZResourceEntry = {
  name: string;
  path: string;
  hash: string;
  bytes: number;
};

export type WACZDataPackage = {
  resources: WACZResourceEntry[];
  created: string;
  wacz_version: string;
  software: string;
  title?: string;
  description?: string;
};

type WACZDigest = {
  path: string;
  hash: string;
  signedData?: string;
};
class CurrZipFileMarker extends Uint8Array {
  // empty array to mark start of WACZ file, also track metadata per-file
  filename: string;
  zipPath: string;
  size: number;
  hasher: Hash;

  constructor(filename: string, zipPath: string, size: number) {
    super();
    this.filename = filename;
    this.zipPath = zipPath;
    this.size = size;
    this.hasher = createHash("sha256");
  }
}

class EndOfZipFileMarker extends Uint8Array {
  // empty array to mark end of WACZ file
}
// ============================================================================
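// Streams a WACZ directly from existing files on disk: WARCs, merged indexes,
// pages, and logs are read once, zipped via client-zip, and per-file sha256
// digests are collected into datapackage.json as the data flows through.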
export class WACZ {
  collDir: string;

  warcs: string[];
  pagesDir: string;
  logsDir: string;
  warcCdxDir: string;
  indexesDir: string;

  datapackage: WACZDataPackage;

  signingUrl: string | null;
  signingToken: string | null;

  // computed once the generated stream has been fully consumed
  private size = 0;
  private hash: string = "";

  constructor(config: WACZInitOpts, collDir: string) {
    this.warcs = config.input;
    this.pagesDir = config.pages;
    this.logsDir = config.logDirectory;
    this.warcCdxDir = config.warcCdxDir;
    this.collDir = collDir;
    this.indexesDir = config.indexesDir;

    this.datapackage = {
      resources: [],
      // drop microseconds
      created: new Date().toISOString().split(".", 1)[0] + "Z",
      wacz_version: "1.1.1",
      software: config.softwareString,
    };

    if (config.title) {
      this.datapackage.title = config.title;
    }
    if (config.description) {
      this.datapackage.description = config.description;
    }

    this.signingUrl = config.signingUrl || null;
    this.signingToken = config.signingToken || null;
  }
  generate(): Readable {
    const files = [
      ...this.warcs,
      ...addDirFiles(this.indexesDir),
      ...addDirFiles(this.pagesDir),
      ...addDirFiles(this.logsDir),
    ];

    const zip = makeZip(
      this.iterDirForZip(files),
    ) as ReadableStream<Uint8Array>;

    const hasher = createHash("sha256");
    const resources = this.datapackage.resources;

    let size = 0;

    // Pass the ZIP stream through unchanged, but use the zero-length marker
    // chunks injected around each file by iterDirForZip() to attribute bytes
    // to individual files: each file's sha256 and size are recorded as a
    // datapackage resource, and the overall hash and size of the WACZ are
    // captured once the stream ends.
    async function* iterWACZ(wacz: WACZ): AsyncIterable<Uint8Array> {
      let currFile: CurrZipFileMarker | null = null;

      for await (const chunk of zip) {
        if (chunk instanceof CurrZipFileMarker) {
          currFile = chunk;
        } else if (chunk instanceof EndOfZipFileMarker) {
          if (currFile) {
            // Frictionless data validation requires this to be lowercase
            const name = basename(currFile.filename).toLowerCase();
            const path = currFile.zipPath;
            const bytes = currFile.size;
            const hash = "sha256:" + currFile.hasher.digest("hex");
            resources.push({ name, path, bytes, hash });
            logger.debug("Added file to WACZ", { path, bytes, hash }, "wacz");
          }
          currFile = null;
        } else {
          yield chunk;
          if (currFile) {
            currFile.hasher.update(chunk);
          }
          hasher.update(chunk);
          size += chunk.length;
        }
      }

      wacz.hash = hasher.digest("hex");
      wacz.size = size;
    }

    return Readable.from(iterWACZ(this));
  }
  getHash() {
    return this.hash;
  }

  getSize() {
    return this.size;
  }

  async generateToFile(filename: string) {
    await pipeline(this.generate(), fs.createWriteStream(filename));
  }
  async *iterDirForZip(files: string[]): AsyncGenerator<InputWithoutMeta> {
    const encoder = new TextEncoder();
    const end = new EndOfZipFileMarker();

    // surround each file's data with zero-length start/end markers so that
    // generate() can compute per-file digests while streaming
    async function* wrapMarkers(
      start: CurrZipFileMarker,
      iter: AsyncIterable<Uint8Array>,
    ) {
      yield start;
      yield* iter;
      yield end;
    }

    async function* getData(data: Uint8Array) {
      yield data;
    }

    for (const filename of files) {
      const input = fs.createReadStream(filename);

      const stat = await fsp.stat(filename);
      const lastModified = stat.mtime;
      const size = stat.size;

      // ZIP path is the file's path relative to the collection directory
      const nameStr = filename.slice(this.collDir.length + 1);
      const name = encoder.encode(nameStr);

      const currFile = new CurrZipFileMarker(filename, nameStr, size);

      yield { input: wrapMarkers(currFile, input), lastModified, name, size };
    }

    // datapackage.json
    const datapackageData = encoder.encode(
      JSON.stringify(this.datapackage, null, 2),
    );

    yield {
      input: getData(datapackageData),
      lastModified: new Date(),
      name: DATAPACKAGE_JSON,
      size: datapackageData.length,
    };

    const hash =
      "sha256:" + createHash("sha256").update(datapackageData).digest("hex");

    // datapackage-digest.json
    const digest: WACZDigest = {
      path: DATAPACKAGE_JSON,
      hash,
    };

    // Get Signature
    if (this.signingUrl) {
      const body = JSON.stringify({
        hash,
        created: this.datapackage.created,
      });

      const headers: Record<string, string> = {
        "Content-Type": "application/json",
      };
      if (this.signingToken) {
        headers["Authorization"] = this.signingToken;
      }

      try {
        const response = await fetch(this.signingUrl, {
          method: "POST",
          headers,
          body,
        });
        digest.signedData = await response.json();
      } catch (e) {
        logger.warn(
          "Failed to sign WACZ, continuing w/o signature",
          { ...formatErr(e) },
          "wacz",
        );
      }
    }

    const digestData = encoder.encode(JSON.stringify(digest, null, 2));

    yield {
      input: getData(digestData),
      lastModified: new Date(),
      name: DATAPACKAGE_DIGEST_JSON,
      size: digestData.length,
    };
  }
}
// list all files in a directory as full paths
export function addDirFiles(fullDir: string): string[] {
  const files = fs.readdirSync(fullDir);
  return files.map((name) => path.join(fullDir, name));
}
// Merge per-WARC CDXJ indices into a single index.cdxj, or into a compressed
// ZipNum index (index.cdx.gz + index.idx)
export async function mergeCDXJ(
  warcCdxDir: string,
  indexesDir: string,
  zipped: boolean | null = null,
) {
  async function* readLinesFrom(stdout: Readable): AsyncGenerator<string> {
    for await (const line of readline.createInterface({ input: stdout })) {
      yield line + "\n";
    }
  }

  // gzip the sorted CDXJ lines in blocks of LINES_PER_BLOCK, yielding the
  // compressed blocks for index.cdx.gz and writing one lookup line per block
  // (key plus offset, length, digest, filename) to index.idx
  async function* generateCompressed(
    reader: AsyncGenerator<string>,
    idxFile: Writable,
  ) {
    let offset = 0;
    const encoder = new TextEncoder();

    const filename = INDEX_CDX_GZ;

    let cdxLines: string[] = [];
    let key = "";
    let count = 0;

    idxFile.write(
      `!meta 0 ${JSON.stringify({
        format: "cdxj-gzip-1.0",
        filename: INDEX_CDX_GZ,
      })}\n`,
    );

    const finishChunk = async () => {
      const compressed = await new Promise<Uint8Array>((resolve) => {
        gzip(encoder.encode(cdxLines.join("")), (_, result) => {
          if (result) {
            resolve(result);
          }
        });
      });

      const length = compressed.length;
      const digest =
        "sha256:" + createHash("sha256").update(compressed).digest("hex");

      const idx =
        key + " " + JSON.stringify({ offset, length, digest, filename });

      idxFile.write(idx + "\n");

      offset += length;

      count = 1;
      key = "";
      cdxLines = [];

      return compressed;
    };

    for await (const cdx of reader) {
      if (!key) {
        key = cdx.split(" {", 1)[0];
      }
      if (++count === LINES_PER_BLOCK) {
        yield await finishChunk();
      }
      cdxLines.push(cdx);
    }

    if (key) {
      yield await finishChunk();
    }
  }

  await fsp.mkdir(indexesDir, { recursive: true });

  const removeIndexFile = async (filename: string) => {
    try {
      await fsp.unlink(path.join(indexesDir, filename));
    } catch (e) {
      // ignore
    }
  };

  const cdxFiles = addDirFiles(warcCdxDir);

  if (!cdxFiles.length) {
    logger.info("No CDXJ files to merge");
    return;
  }

  if (zipped === null) {
    const tempCdxSize = await getDirSize(warcCdxDir);

    // if CDX size is at least this size, use compressed version
    zipped = tempCdxSize >= ZIP_CDX_MIN_SIZE;
  }

  // merge the per-WARC CDXJ files with the system `sort` (LC_ALL=C for bytewise ordering)
  const proc = child_process.spawn("sort", cdxFiles, {
    env: { LC_ALL: "C" },
  });

  if (!zipped) {
    const output = fs.createWriteStream(path.join(indexesDir, INDEX_CDXJ));

    await pipeline(Readable.from(readLinesFrom(proc.stdout)), output);

    await removeIndexFile(INDEX_IDX);
    await removeIndexFile(INDEX_CDX_GZ);
  } else {
    const output = fs.createWriteStream(path.join(indexesDir, INDEX_CDX_GZ));
    const outputIdx = fs.createWriteStream(path.join(indexesDir, INDEX_IDX), {
      encoding: "utf-8",
    });

    await pipeline(
      Readable.from(generateCompressed(readLinesFrom(proc.stdout), outputIdx)),
      output,
    );

    await streamFinish(outputIdx);

    await removeIndexFile(INDEX_CDXJ);
  }
}