Streaming in-place WACZ creation + CDXJ indexing (#673)

Fixes #674 

This PR supersedes #505. Instead of using js-wacz for optimized WACZ
creation, it:
- generates an 'in-place' or 'streaming' WACZ in the crawler, without
having to copy the data again.
- streams WACZ contents to a remote upload (or to disk) directly from
existing files on disk.
- first writes per-WARC CDXJ indices to the 'warc-cdx' directory, then
merges them with the Linux 'sort' command, compressing the result to
ZipNum when the CDXJ totals at least 50K (with --generateCDX alone, an
uncompressed index.cdxj is always written).
- writes and reads all data in the WARCs only once.

This should result in significant speed and disk-usage improvements:
previously, each WARC was written once, then read again (for CDXJ
indexing), read again (for adding to the new WACZ ZIP), written to disk
(into the new WACZ ZIP), and read again (if uploading to a remote
endpoint). Now, WARCs are written once along with the per-WARC CDXJ;
only the CDXJ is reread, sorted, and merged on disk; and all data is
read once, either to generate the WACZ on disk or to upload it to
remote storage. A sketch of the single-pass pattern follows.
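To illustrate the single-read claim: while the ZIP bytes stream out, the WACZ accumulates its own SHA-256 and byte count, so no second pass over the data is needed before upload. A minimal sketch of that pattern, assuming nothing beyond Node's crypto module (names are illustrative, not the PR's code):

```ts
import { createHash } from "node:crypto";

// Illustrative sketch: pass chunks through unchanged while accumulating a
// SHA-256 digest and byte count, so hash and size are known as soon as the
// stream drains, with no second read of the data.
async function* hashWhileStreaming(
  source: AsyncIterable<Uint8Array>,
  onDone: (hash: string, size: number) => void,
): AsyncGenerator<Uint8Array> {
  const hasher = createHash("sha256");
  let size = 0;
  for await (const chunk of source) {
    hasher.update(chunk);
    size += chunk.length;
    yield chunk; // bytes continue on to disk or to the uploader
  }
  onDone(hasher.digest("hex"), size);
}
```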

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Commit 85a07aff18 (parent 8934feaf70), authored by Ilya Kreymer on 2024-08-29 13:21:20 -07:00, committed via GitHub.
15 changed files with 650 additions and 203 deletions


@@ -60,8 +60,11 @@ jobs:
- name: add http-server for tests
run: yarn add -D http-server
- name: install py-wacz as root for tests
run: sudo pip install wacz
- name: run all tests as root
run: sudo DOCKER_HOST_NAME=172.17.0.1 yarn test
run: sudo DOCKER_HOST_NAME=172.17.0.1 yarn test -validate
- name: run saved state + qa compare test as non-root - with volume owned by current user
run: |


@@ -17,13 +17,6 @@ EXPOSE 9222 9223 6080
WORKDIR /app
ADD requirements.txt /app/
RUN python3 -m venv /app/python-venv && \
/app/python-venv/bin/pip install -U setuptools && \
/app/python-venv/bin/pip install -r requirements.txt && \
ln -s /app/python-venv/bin/wacz /usr/bin/wacz && \
ln -s /app/python-venv/bin/cdxj-indexer /usr/bin/cdxj-indexer
ADD package.json yarn.lock /app/
# to allow forcing rebuilds from this stage


@@ -17,9 +17,9 @@
},
"dependencies": {
"@novnc/novnc": "^1.4.0",
"@types/sax": "^1.2.7",
"@webrecorder/wabac": "^2.19.7",
"@webrecorder/wabac": "^2.19.8",
"browsertrix-behaviors": "^0.6.4",
"client-zip": "^2.4.5",
"fetch-socks": "^1.3.0",
"get-folder-size": "^4.0.0",
"husky": "^8.0.3",
@@ -36,7 +36,7 @@
"tsc": "^2.0.4",
"undici": "^6.18.2",
"uuid": "8.3.2",
"warcio": "^2.2.1",
"warcio": "^2.3.0",
"ws": "^7.4.4",
"yargs": "^17.7.2"
},
@@ -46,6 +46,7 @@
"@types/node": "^20.8.7",
"@types/pixelmatch": "^5.2.6",
"@types/pngjs": "^6.0.4",
"@types/sax": "^1.2.7",
"@types/uuid": "^9.0.6",
"@types/ws": "^8.5.8",
"@typescript-eslint/eslint-plugin": "^6.10.0",
@@ -62,5 +63,8 @@
"jest": {
"transform": {},
"testTimeout": 90000
},
"resolutions": {
"wrap-ansi": "7.0.0"
}
}
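client-zip, added above, is what enables the streaming ZIP: makeZip() assembles the archive lazily as the consumer pulls bytes. A minimal Node sketch of its use, with placeholder file names (the PR additionally wraps each input in marker objects to track per-file hashes; that is omitted here):

```ts
import fs from "node:fs";
import { Readable } from "node:stream";
import { pipeline } from "node:stream/promises";
import { ReadableStream } from "node:stream/web";
import { makeZip } from "client-zip";

// Sketch only (paths are placeholders): each input file is read exactly once,
// as the consumer pulls the ZIP bytes.
const names = ["rec-0.warc.gz", "rec-1.warc.gz"];
const files = names.map((name) => {
  const { mtime, size } = fs.statSync(name);
  return { name, input: fs.createReadStream(name), lastModified: mtime, size };
});
const zip = makeZip(files) as ReadableStream<Uint8Array>;
await pipeline(Readable.fromWeb(zip), fs.createWriteStream("out.zip"));
```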


@@ -16,6 +16,8 @@ import { parseArgs } from "./util/argParser.js";
import yaml from "js-yaml";
import { WACZ, WACZInitOpts, mergeCDXJ } from "./util/wacz.js";
import { HealthChecker } from "./util/healthcheck.js";
import { TextExtractViaSnapshot } from "./util/textextract.js";
import {
@@ -62,7 +64,12 @@ import {
import { Recorder } from "./util/recorder.js";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
import {
WARCWriter,
createWARCInfo,
setWARCInfo,
streamFinish,
} from "./util/warcwriter.js";
import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
import { initProxy } from "./util/proxy.js";
@@ -117,7 +124,7 @@ export class Crawler {
pagesFH?: WriteStream | null = null;
extraPagesFH?: WriteStream | null = null;
logFH!: WriteStream;
logFH: WriteStream | null = null;
crawlId: string;
@@ -150,7 +157,8 @@
archivesDir: string;
tempdir: string;
tempCdxDir: string;
warcCdxDir: string;
indexesDir: string;
screenshotWriter: WARCWriter | null;
textWriter: WARCWriter | null;
@@ -288,7 +296,10 @@
// archives dir
this.archivesDir = path.join(this.collDir, "archive");
this.tempdir = path.join(os.tmpdir(), "tmp-dl");
this.tempCdxDir = path.join(this.collDir, "tmp-cdx");
// indexes dirs
this.warcCdxDir = path.join(this.collDir, "warc-cdx");
this.indexesDir = path.join(this.collDir, "indexes");
this.screenshotWriter = null;
this.textWriter = null;
@@ -470,7 +481,7 @@
if (!this.params.dryRun) {
await fsp.mkdir(this.archivesDir, { recursive: true });
await fsp.mkdir(this.tempdir, { recursive: true });
await fsp.mkdir(this.tempCdxDir, { recursive: true });
await fsp.mkdir(this.warcCdxDir, { recursive: true });
}
this.logFH = fs.createWriteStream(this.logFilename, { flags: "a" });
@@ -1478,36 +1489,24 @@ self.__bx_behaviors.selectMainBehavior();
await this.combineWARC();
}
if (this.params.generateCDX && !this.params.dryRun) {
logger.info("Generating CDX");
await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
await this.crawlState.setStatus("generate-cdx");
const warcList = await fsp.readdir(this.archivesDir);
const warcListFull = warcList.map((filename) =>
path.join(this.archivesDir, filename),
);
//const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
const params = [
"-o",
path.join(this.collDir, "indexes", "index.cdxj"),
...warcListFull,
];
const indexResult = await this.awaitProcess(
child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd }),
);
if (indexResult === 0) {
logger.debug("Indexing complete, CDX successfully created");
} else {
logger.error("Error indexing and generating CDX", {
"status code": indexResult,
});
}
}
logger.info("Crawling done");
if (
(this.params.generateCDX || this.params.generateWACZ) &&
!this.params.dryRun
) {
logger.info("Merging CDX");
await this.crawlState.setStatus(
this.params.generateWACZ ? "generate-wacz" : "generate-cdx",
);
await mergeCDXJ(
this.warcCdxDir,
this.indexesDir,
this.params.generateWACZ ? null : false,
);
}
if (
this.params.generateWACZ &&
!this.params.dryRun &&
@@ -1543,11 +1542,9 @@ self.__bx_behaviors.selectMainBehavior();
if (!this.logFH) {
return;
}
try {
await new Promise<void>((resolve) => this.logFH.close(() => resolve()));
} catch (e) {
// ignore
}
const logFH = this.logFH;
this.logFH = null;
await streamFinish(logFH);
}
async generateWACZ() {
@@ -1577,110 +1574,67 @@ self.__bx_behaviors.selectMainBehavior();
logger.fatal("No WARC Files, assuming crawl failed");
}
logger.debug("End of log file, storing logs in WACZ");
const waczPath = path.join(this.collDir, this.params.collection + ".wacz");
// Build the argument list to pass to the wacz create command
const waczFilename = this.params.collection.concat(".wacz");
const waczPath = path.join(this.collDir, waczFilename);
const streaming = !!this.storage;
const createArgs = [
"create",
"-o",
waczPath,
"--pages",
this.seedPagesFile,
"--extra-pages",
this.otherPagesFile,
"--copy-pages",
"--log-directory",
this.logDir,
];
if (!streaming) {
logger.debug("WACZ will be written to disk", { path: waczPath }, "wacz");
} else {
logger.debug("WACZ will be stream uploaded to remote storage");
}
logger.debug("End of log file in WACZ, storing logs to WACZ file");
await this.closeLog();
const waczOpts: WACZInitOpts = {
input: warcFileList.map((x) => path.join(this.archivesDir, x)),
output: waczPath,
pages: this.pagesDir,
logDirectory: this.logDir,
warcCdxDir: this.warcCdxDir,
indexesDir: this.indexesDir,
softwareString: this.infoString,
};
if (process.env.WACZ_SIGN_URL) {
createArgs.push("--signing-url");
createArgs.push(process.env.WACZ_SIGN_URL);
waczOpts.signingUrl = process.env.WACZ_SIGN_URL;
if (process.env.WACZ_SIGN_TOKEN) {
createArgs.push("--signing-token");
createArgs.push(process.env.WACZ_SIGN_TOKEN);
waczOpts.signingToken = "bearer " + process.env.WACZ_SIGN_TOKEN;
}
}
if (this.params.title) {
createArgs.push("--title");
createArgs.push(this.params.title);
waczOpts.title = this.params.title;
}
if (this.params.description) {
createArgs.push("--desc");
createArgs.push(this.params.description);
waczOpts.description = this.params.description;
}
createArgs.push("-f");
warcFileList.forEach((val) =>
createArgs.push(path.join(this.archivesDir, val)),
);
// create WACZ
const waczResult = await this.awaitProcess(
child_process.spawn("wacz", createArgs, { detached: RUN_DETACHED }),
);
if (waczResult !== 0) {
logger.error("Error creating WACZ", { "status code": waczResult });
logger.fatal("Unable to write WACZ successfully");
try {
const wacz = new WACZ(waczOpts, this.collDir);
if (!streaming) {
await wacz.generateToFile(waczPath);
}
logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);
// Verify WACZ
/*
const validateArgs = ["validate"];
validateArgs.push("-f");
validateArgs.push(waczPath);
const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs));
if (waczVerifyResult !== 0) {
console.log("validate", waczVerifyResult);
logger.fatal("Unable to verify WACZ created successfully");
}
*/
if (this.storage) {
await this.crawlState.setStatus("uploading-wacz");
const filename = process.env.STORE_FILENAME || "@ts-@id.wacz";
const targetFilename = interpolateFilename(filename, this.crawlId);
await this.storage.uploadCollWACZ(waczPath, targetFilename, isFinished);
await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);
return true;
}
return false;
} catch (e) {
logger.error("Error creating WACZ", e);
if (!streaming) {
logger.fatal("Unable to write WACZ successfully");
}
awaitProcess(proc: ChildProcess) {
const stdout: string[] = [];
const stderr: string[] = [];
proc.stdout!.on("data", (data) => {
stdout.push(data.toString());
});
proc.stderr!.on("data", (data) => {
stderr.push(data.toString());
});
return new Promise((resolve) => {
proc.on("close", (code) => {
if (stdout.length) {
logger.debug(stdout.join("\n"));
}
if (stderr.length && this.params.logging.includes("debug")) {
logger.debug(stderr.join("\n"));
}
resolve(code);
});
});
}
logMemory() {
@@ -2604,7 +2558,7 @@ self.__bx_behaviors.selectMainBehavior();
return new WARCWriter({
archivesDir: this.archivesDir,
tempCdxDir: this.tempCdxDir,
warcCdxDir: this.warcCdxDir,
filenameTemplate,
rolloverSize: this.params.rolloverSize,
gzip,
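Condensed, the new post-crawl flow from the hunks above looks roughly like this (a simplified recap, not verbatim from the diff; the helper function and the import paths are assumptions):

```ts
import { WACZ, WACZInitOpts, mergeCDXJ } from "./util/wacz.js";
import { S3StorageSync } from "./util/storage.js";

// Simplified recap: merge per-WARC CDXJ first, then either stream-upload the
// WACZ or write it to disk, depending on whether remote storage is configured.
async function finalizeCrawl(
  opts: WACZInitOpts,
  collDir: string,
  waczPath: string,
  storage: S3StorageSync | null,
  generateWACZ: boolean,
) {
  // third argument: null = auto (ZipNum if the merged CDXJ totals >= 50K);
  // false = always write a plain index.cdxj (the --generateCDX-only path)
  await mergeCDXJ(opts.warcCdxDir, opts.indexesDir, generateWACZ ? null : false);
  if (!generateWACZ) return;
  const wacz = new WACZ(opts, collDir);
  if (storage) {
    // single streamed pass: the ZIP is generated during upload, no local .wacz
    await storage.uploadCollWACZ(wacz, "crawl.wacz", true);
  } else {
    await wacz.generateToFile(waczPath);
  }
}
```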


@@ -201,7 +201,7 @@ class ArgParser {
generateWACZ: {
alias: ["generatewacz", "generateWacz"],
describe: "If set, generate wacz",
describe: "If set, generate WACZ on disk",
type: "boolean",
default: false,
},


@@ -51,6 +51,7 @@ export const LOG_CONTEXT_TYPES = [
"crawlStatus",
"links",
"sitemap",
"wacz",
"replay",
"proxy",
] as const;


@@ -14,6 +14,8 @@ import { logger } from "./logger.js";
// @ts-expect-error (incorrect types on get-folder-size)
import getFolderSize from "get-folder-size";
import { WACZ } from "./wacz.js";
const DEFAULT_REGION = "us-east-1";
// ===========================================================================
@@ -81,6 +83,32 @@ export class S3StorageSync {
this.webhookUrl = webhookUrl;
}
async uploadStreamingWACZ(wacz: WACZ, targetFilename: string) {
const fileUploadInfo = {
bucket: this.bucketName,
crawlId: this.crawlId,
prefix: this.objectPrefix,
targetFilename,
};
logger.info("S3 file upload information", fileUploadInfo, "storage");
const waczStream = wacz.generate();
await this.client.putObject(
this.bucketName,
this.objectPrefix + targetFilename,
waczStream,
);
const hash = wacz.getHash();
const path = targetFilename;
const size = wacz.getSize();
// for backwards compatibility, keep 'bytes'
return { path, size, hash, bytes: size };
}
async uploadFile(srcFilename: string, targetFilename: string) {
const fileUploadInfo = {
bucket: this.bucketName,
@@ -114,11 +142,15 @@
}
async uploadCollWACZ(
srcFilename: string,
srcOrWACZ: string | WACZ,
targetFilename: string,
completed = true,
) {
const resource = await this.uploadFile(srcFilename, targetFilename);
const resource =
typeof srcOrWACZ === "string"
? await this.uploadFile(srcOrWACZ, targetFilename)
: await this.uploadStreamingWACZ(srcOrWACZ, targetFilename);
logger.info(
"WACZ S3 file upload resource",
{ targetFilename, resource },
@@ -191,7 +223,7 @@ export async function getFileSize(filename: string) {
return stats.size;
}
export async function getDirSize(dir: string) {
export async function getDirSize(dir: string): Promise<number> {
const { size, errors } = await getFolderSize(dir);
if (errors && errors.length) {
logger.warn("Size check errors", { errors }, "storage");
@@ -234,11 +266,16 @@ export async function checkDiskUtilization(
const kbTotal = parseInt(diskUsage["1K-blocks"]);
let kbArchiveDirSize = Math.round(archiveDirSize / 1024);
// assume if has STORE_ENDPOINT_URL, will be uploading to remote
// and not storing local copy of either WACZ or WARC
if (!process.env.STORE_ENDPOINT_URL) {
if (params.combineWARC && params.generateWACZ) {
kbArchiveDirSize *= 4;
} else if (params.combineWARC || params.generateWACZ) {
kbArchiveDirSize *= 2;
}
}
const projectedTotal = kbUsed + kbArchiveDirSize;
const projectedUsedPercentage = calculatePercentageUsed(

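uploadStreamingWACZ above relies on minio's putObject accepting a Readable, so the WACZ is uploaded as it is assembled. A stripped-down sketch (endpoint, credentials, bucket, and file names are all placeholders):

```ts
import { Client } from "minio";
import { WACZ } from "./wacz.js";

// Placeholder values throughout: putObject pulls from the Readable returned
// by wacz.generate(), so the archive bytes never hit the local disk.
async function uploadStreaming(wacz: WACZ) {
  const client = new Client({
    endPoint: "localhost",
    port: 9000,
    useSSL: false,
    accessKey: "ACCESS_KEY",
    secretKey: "SECRET_KEY",
  });
  await client.putObject("my-bucket", "crawl.wacz", wacz.generate());
  // hash and size are finalized only after the stream has fully drained
  console.log(wacz.getHash(), wacz.getSize());
}
```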
src/util/wacz.ts (new file, 429 lines)

@@ -0,0 +1,429 @@
import path, { basename } from "node:path";
import fs from "node:fs";
import fsp from "node:fs/promises";
import { Writable, Readable } from "node:stream";
import { pipeline } from "node:stream/promises";
import readline from "node:readline";
import child_process from "node:child_process";
import { createHash, Hash } from "node:crypto";
import { gzip } from "node:zlib";
import { ReadableStream } from "node:stream/web";
import { makeZip, InputWithoutMeta } from "client-zip";
import { logger, formatErr } from "./logger.js";
import { streamFinish } from "./warcwriter.js";
import { getDirSize } from "./storage.js";
const DATAPACKAGE_JSON = "datapackage.json";
const DATAPACKAGE_DIGEST_JSON = "datapackage-digest.json";
const INDEX_CDXJ = "index.cdxj";
const INDEX_IDX = "index.idx";
const INDEX_CDX_GZ = "index.cdx.gz";
const LINES_PER_BLOCK = 256;
const ZIP_CDX_MIN_SIZE = 50_000;
// ============================================================================
export type WACZInitOpts = {
input: string[];
output: string;
pages: string;
warcCdxDir: string;
indexesDir: string;
logDirectory: string;
softwareString: string;
signingUrl?: string;
signingToken?: string;
title?: string;
description?: string;
};
export type WACZResourceEntry = {
name: string;
path: string;
hash: string;
bytes: number;
};
export type WACZDataPackage = {
resources: WACZResourceEntry[];
created: string;
wacz_version: string;
software: string;
title?: string;
description?: string;
};
type WACZDigest = {
path: string;
hash: string;
signedData?: string;
};
class CurrZipFileMarker extends Uint8Array {
// empty array to mark start of WACZ file, also track metadata per-file
filename: string;
zipPath: string;
size: number;
hasher: Hash;
constructor(filename: string, zipPath: string, size: number) {
super();
this.filename = filename;
this.zipPath = zipPath;
this.size = size;
this.hasher = createHash("sha256");
}
}
class EndOfZipFileMarker extends Uint8Array {
// empty array to mark end of WACZ file
}
// ============================================================================
export class WACZ {
collDir: string;
warcs: string[];
pagesDir: string;
logsDir: string;
warcCdxDir: string;
indexesDir: string;
datapackage: WACZDataPackage;
signingUrl: string | null;
signingToken: string | null;
private size = 0;
private hash: string = "";
constructor(config: WACZInitOpts, collDir: string) {
this.warcs = config.input;
this.pagesDir = config.pages;
this.logsDir = config.logDirectory;
this.warcCdxDir = config.warcCdxDir;
this.collDir = collDir;
this.indexesDir = config.indexesDir;
this.datapackage = {
resources: [],
// drop microseconds
created: new Date().toISOString().split(".", 1)[0] + "Z",
wacz_version: "1.1.1",
software: config.softwareString,
};
if (config.title) {
this.datapackage.title = config.title;
}
if (config.description) {
this.datapackage.description = config.description;
}
this.signingUrl = config.signingUrl || null;
this.signingToken = config.signingToken || null;
}
generate(): Readable {
const files = [
...this.warcs,
...addDirFiles(this.indexesDir),
...addDirFiles(this.pagesDir),
...addDirFiles(this.logsDir),
];
const zip = makeZip(
this.iterDirForZip(files),
) as ReadableStream<Uint8Array>;
const hasher = createHash("sha256");
const resources = this.datapackage.resources;
let size = 0;
async function* iterWACZ(wacz: WACZ): AsyncIterable<Uint8Array> {
let currFile: CurrZipFileMarker | null = null;
for await (const chunk of zip) {
if (chunk instanceof CurrZipFileMarker) {
currFile = chunk;
} else if (chunk instanceof EndOfZipFileMarker) {
if (currFile) {
// Frictionless data validation requires this to be lowercase
const name = basename(currFile.filename).toLowerCase();
const path = currFile.zipPath;
const bytes = currFile.size;
const hash = "sha256:" + currFile.hasher.digest("hex");
resources.push({ name, path, bytes, hash });
logger.debug("Added file to WACZ", { path, bytes, hash }, "wacz");
}
currFile = null;
} else {
yield chunk;
if (currFile) {
currFile.hasher.update(chunk);
}
hasher.update(chunk);
size += chunk.length;
}
}
wacz.hash = hasher.digest("hex");
wacz.size = size;
}
return Readable.from(iterWACZ(this));
}
getHash() {
return this.hash;
}
getSize() {
return this.size;
}
async generateToFile(filename: string) {
await pipeline(this.generate(), fs.createWriteStream(filename));
}
async *iterDirForZip(files: string[]): AsyncGenerator<InputWithoutMeta> {
const encoder = new TextEncoder();
const end = new EndOfZipFileMarker();
async function* wrapMarkers(
start: CurrZipFileMarker,
iter: AsyncIterable<Uint8Array>,
) {
yield start;
yield* iter;
yield end;
}
async function* getData(data: Uint8Array) {
yield data;
}
for (const filename of files) {
const input = fs.createReadStream(filename);
const stat = await fsp.stat(filename);
const lastModified = stat.mtime;
const size = stat.size;
const nameStr = filename.slice(this.collDir.length + 1);
const name = encoder.encode(nameStr);
const currFile = new CurrZipFileMarker(filename, nameStr, size);
yield { input: wrapMarkers(currFile, input), lastModified, name, size };
}
// datapackage.json
const datapackageData = encoder.encode(
JSON.stringify(this.datapackage, null, 2),
);
yield {
input: getData(datapackageData),
lastModified: new Date(),
name: DATAPACKAGE_JSON,
size: datapackageData.length,
};
const hash =
"sha256:" + createHash("sha256").update(datapackageData).digest("hex");
// datapackage-digest.json
const digest: WACZDigest = {
path: DATAPACKAGE_JSON,
hash,
};
// Get Signature
if (this.signingUrl) {
const body = JSON.stringify({
hash,
created: this.datapackage.created,
});
const headers: Record<string, string> = {
"Content-Type": "application/json",
};
if (this.signingToken) {
headers["Authorization"] = this.signingToken;
}
try {
const response = await fetch(this.signingUrl, {
method: "POST",
headers,
body,
});
digest.signedData = await response.json();
} catch (e) {
logger.warn(
"Failed to sign WACZ, continuing w/o signature",
{ ...formatErr(e) },
"wacz",
);
}
}
const digestData = encoder.encode(JSON.stringify(digest, null, 2));
yield {
input: getData(digestData),
lastModified: new Date(),
name: DATAPACKAGE_DIGEST_JSON,
size: digestData.length,
};
}
}
// Merge CDX
export function addDirFiles(fullDir: string): string[] {
const files = fs.readdirSync(fullDir);
return files.map((name) => path.join(fullDir, name));
}
export async function mergeCDXJ(
warcCdxDir: string,
indexesDir: string,
zipped: boolean | null = null,
) {
async function* readLinesFrom(stdout: Readable): AsyncGenerator<string> {
for await (const line of readline.createInterface({ input: stdout })) {
yield line + "\n";
}
}
async function* generateCompressed(
reader: AsyncGenerator<string>,
idxFile: Writable,
) {
let offset = 0;
const encoder = new TextEncoder();
const filename = INDEX_CDX_GZ;
let cdxLines: string[] = [];
let key = "";
let count = 0;
idxFile.write(
`!meta 0 ${JSON.stringify({
format: "cdxj-gzip-1.0",
filename: INDEX_CDX_GZ,
})}\n`,
);
const finishChunk = async () => {
const compressed = await new Promise<Uint8Array>((resolve) => {
gzip(encoder.encode(cdxLines.join("")), (_, result) => {
if (result) {
resolve(result);
}
});
});
const length = compressed.length;
const digest =
"sha256:" + createHash("sha256").update(compressed).digest("hex");
const idx =
key + " " + JSON.stringify({ offset, length, digest, filename });
idxFile.write(idx + "\n");
offset += length;
count = 1;
key = "";
cdxLines = [];
return compressed;
};
for await (const cdx of reader) {
if (!key) {
key = cdx.split(" {", 1)[0];
}
if (++count === LINES_PER_BLOCK) {
yield await finishChunk();
}
cdxLines.push(cdx);
}
if (key) {
yield await finishChunk();
}
}
await fsp.mkdir(indexesDir, { recursive: true });
const removeIndexFile = async (filename: string) => {
try {
await fsp.unlink(path.join(indexesDir, filename));
} catch (e) {
// ignore
}
};
const cdxFiles = addDirFiles(warcCdxDir);
if (!cdxFiles.length) {
logger.info("No CDXJ files to merge");
return;
}
if (zipped === null) {
const tempCdxSize = await getDirSize(warcCdxDir);
// if CDX size is at least this size, use compressed version
zipped = tempCdxSize >= ZIP_CDX_MIN_SIZE;
}
const proc = child_process.spawn("sort", cdxFiles, {
env: { LC_ALL: "C" },
});
if (!zipped) {
const output = fs.createWriteStream(path.join(indexesDir, INDEX_CDXJ));
await pipeline(Readable.from(readLinesFrom(proc.stdout)), output);
await removeIndexFile(INDEX_IDX);
await removeIndexFile(INDEX_CDX_GZ);
} else {
const output = fs.createWriteStream(path.join(indexesDir, INDEX_CDX_GZ));
const outputIdx = fs.createWriteStream(path.join(indexesDir, INDEX_IDX), {
encoding: "utf-8",
});
await pipeline(
Readable.from(generateCompressed(readLinesFrom(proc.stdout), outputIdx)),
output,
);
await streamFinish(outputIdx);
await removeIndexFile(INDEX_CDXJ);
}
}
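For context on the ZipNum layout written above: index.idx starts with a '!meta' line, then holds one line per gzip block of the form "searchkey timestamp {json}", where the JSON gives the block's byte range inside index.cdx.gz. A hypothetical reader for a single block (paths and parsing are illustrative):

```ts
import fs from "node:fs";
import { gunzipSync } from "node:zlib";

// Hypothetical reader: given one index.idx line, read and decompress the
// referenced byte range of index.cdx.gz to recover that block's CDXJ lines
// (each block is an independent gzip member, so gunzipSync works per block).
function readZipNumBlock(idxLine: string, cdxGzPath: string): string[] {
  const { offset, length } = JSON.parse(
    idxLine.slice(idxLine.indexOf(" {") + 1),
  );
  const buf = Buffer.alloc(length);
  const fd = fs.openSync(cdxGzPath, "r");
  fs.readSync(fd, buf, 0, length, offset);
  fs.closeSync(fd);
  return gunzipSync(buf).toString("utf-8").split("\n").filter(Boolean);
}
```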


@@ -24,7 +24,7 @@ export type ResourceRecordData = {
// =================================================================
export class WARCWriter implements IndexerOffsetLength {
archivesDir: string;
tempCdxDir?: string;
warcCdxDir?: string;
filenameTemplate: string;
filename?: string;
gzip: boolean;
@@ -45,23 +45,21 @@
constructor({
archivesDir,
tempCdxDir,
warcCdxDir,
filenameTemplate,
rolloverSize = DEFAULT_ROLLOVER_SIZE,
gzip,
logDetails,
}: {
archivesDir: string;
tempCdxDir?: string;
warcCdxDir?: string;
filenameTemplate: string;
rolloverSize?: number;
gzip: boolean;
logDetails: Record<string, string>;
}) {
this.archivesDir = archivesDir;
this.tempCdxDir = tempCdxDir;
// for now, disabling CDX
this.tempCdxDir = undefined;
this.warcCdxDir = warcCdxDir;
this.logDetails = logDetails;
this.gzip = gzip;
this.rolloverSize = rolloverSize;
@@ -77,7 +75,7 @@ export class WARCWriter implements IndexerOffsetLength {
this.offset = 0;
this.recordLength = 0;
if (this.tempCdxDir) {
if (this.warcCdxDir) {
this.indexer = new CDXIndexer({ format: "cdxj" });
}
@@ -112,14 +110,19 @@
flags: "a",
});
}
if (!this.cdxFH && this.tempCdxDir) {
if (!this.cdxFH && this.warcCdxDir) {
this.cdxFH = fs.createWriteStream(
path.join(this.tempCdxDir, this.filename + ".cdx"),
path.join(this.warcCdxDir, this.filename + ".cdx"),
{ flags: "a" },
);
}
fh.write(await createWARCInfo(this.filename));
const buffer = await createWARCInfo(this.filename);
fh.write(buffer);
// account for size of warcinfo record, (don't index as warcinfo never added to cdx)
this.recordLength = buffer.length;
this.offset += buffer.length;
return fh;
}


@@ -3,20 +3,25 @@ import fs from "fs";
import path from "path";
import md5 from "md5";
const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0];
const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...args);
test("ensure basic crawl run with docker run passes", async () => {
child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix',
);
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz",
);
child_process.execSync(
"unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz",
);
});
testIf(doValidate, "validate wacz", () => {
child_process.execSync(
"wacz validate --file ./test-crawls/collections/wr-net/wr-net.wacz",
);
});
test("check that individual WARCs have correct prefix and are under rollover size", () => {
const archiveWarcLists = fs.readdirSync(
"test-crawls/collections/wr-net/archive",


@@ -1,13 +1,18 @@
import child_process from "child_process";
import fs from "fs";
const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0];
const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...args);
test("ensure multi url crawl run with docker run passes", async () => {
child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2',
);
});
testIf(doValidate, "validate multi url crawl wacz", () => {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz",
"wacz validate --file ./test-crawls/collections/advanced/advanced.wacz",
);
});


@@ -127,7 +127,7 @@ function validateResourcesIndex(json) {
mime: "image/vnd.microsoft.icon",
type: "other",
},
"https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2F&d=webrecorder.net":
"https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2F&d=webrecorder.net&r=null":
{ status: 202, mime: "text/plain", type: "xhr" },
});
}
@@ -172,7 +172,7 @@
{ status: 200, mime: "font/woff2", type: "font" },
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
{ status: 200, mime: "font/woff2", type: "font" },
"https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net":
"https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net&r=null":
{
status: 0,
type: "xhr",


@@ -1,19 +0,0 @@
import child_process from "child_process";
test("ensure crawl run with redis passes", async () => {
const redis = child_process.spawn(
"docker run -d --name test-crawl-redis -p 6379:6379 redis",
);
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection redis-crawl --redisStoreUrl redis://127.0.0.1:6379 --workers 2",
);
redis.kill("SIGINT");
});
test("check that wacz created is valid", () => {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz",
);
});


@@ -102,7 +102,7 @@
/* Completeness */
// "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
"skipLibCheck": true /* Skip type checking all .d.ts files. */
"skipLibCheck": false /* Skip type checking all .d.ts files. */
},
"include": ["src/**/*"]


@@ -1148,6 +1148,11 @@
dependencies:
undici-types "~5.25.1"
"@types/pako@^1.0.7":
version "1.0.7"
resolved "https://registry.yarnpkg.com/@types/pako/-/pako-1.0.7.tgz#aa0e4af9855d81153a29ff84cc44cce25298eda9"
integrity sha512-YBtzT2ztNF6R/9+UXj2wTGFnC9NklAnASt3sC0h2m1bbH7G6FyBIkt4AN8ThZpNfxUo1b2iMVO0UawiJymEt8A==
"@types/pixelmatch@^5.2.6":
version "5.2.6"
resolved "https://registry.yarnpkg.com/@types/pixelmatch/-/pixelmatch-5.2.6.tgz#fba6de304ac958495f27d85989f5c6bb7499a686"
@@ -1179,6 +1184,13 @@
resolved "https://registry.yarnpkg.com/@types/stack-utils/-/stack-utils-2.0.0.tgz#7036640b4e21cc2f259ae826ce843d277dad8cff"
integrity sha512-RJJrrySY7A8havqpGObOB4W92QXKJo63/jFLLgpvOtsGUqbQZ9Sbgl35KMm1DjC6j7AvmmU2bIno+3IyEaemaw==
"@types/stream-buffers@^3.0.7":
version "3.0.7"
resolved "https://registry.yarnpkg.com/@types/stream-buffers/-/stream-buffers-3.0.7.tgz#0b719fa1bd2ca2cc0908205a440e5e569e1aa21e"
integrity sha512-azOCy05sXVXrO+qklf0c/B07H/oHaIuDDAiHPVwlk3A9Ek+ksHyTeMajLZl3r76FxpPpxem//4Te61G1iW3Giw==
dependencies:
"@types/node" "*"
"@types/uuid@^9.0.6":
version "9.0.6"
resolved "https://registry.yarnpkg.com/@types/uuid/-/uuid-9.0.6.tgz#c91ae743d8344a54b2b0c691195f5ff5265f6dfb"
@@ -1300,15 +1312,15 @@
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
"@webrecorder/wabac@^2.19.7":
version "2.19.7"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.7.tgz#3afe48f79752bcd189cffd5d5e6a8dbe4f394053"
integrity sha512-X9UFxWCww1KWDnAaEjg7vpg6SznBov5a88FPxbOvo5yCT/UkJcQHaa0qo1L52l46sIAUnSbsYz1ur9yMd6ygVA==
"@webrecorder/wabac@^2.19.8":
version "2.19.8"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.8.tgz#302ade200953a1c76f0b355983ae4081428fb933"
integrity sha512-WjyfsGK8JWKeeDsGrOIT8ZLjMcOOAN93OMnRLO214jSV18SHEOY4JRvXzFOLF+OWYC5kJIMjl05gurTLq18jOA==
dependencies:
"@peculiar/asn1-ecc" "^2.3.4"
"@peculiar/asn1-schema" "^2.3.3"
"@peculiar/x509" "^1.9.2"
"@webrecorder/wombat" "^3.7.14"
"@webrecorder/wombat" "^3.8.0"
acorn "^8.10.0"
auto-js-ipfs "^2.1.1"
base64-js "^1.5.1"
@@ -1327,14 +1339,14 @@
path-parser "^6.1.0"
process "^0.11.10"
stream-browserify "^3.0.0"
warcio "^2.2.1"
warcio "^2.3.0"
"@webrecorder/wombat@^3.7.14":
version "3.7.14"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.14.tgz#3779e4cadb256755bbbfd2960805965ec4daacd8"
integrity sha512-sDNH+c8WstQrK91y8kIPJh1XAC2WXLU5rC8wztANzK1mVzA7v6XB5gk3Yp7OIAn4bn1XuGRVjubhKhmxVVZ9kg==
"@webrecorder/wombat@^3.8.0":
version "3.8.0"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.0.tgz#63ed3df199f11223b23c9ce66202590b8511ae2a"
integrity sha512-MpzNu9+ClCHjOER9XCrsEIsJk15L6qGO+PxeBPiOtaFJmNUiz0auMT5AQwiPqJgKEAniZTlPx1O4kNCVJu9f2Q==
dependencies:
warcio "^2.2.0"
warcio "^2.3.0"
"@zxing/text-encoding@0.9.0":
version "0.9.0"
@@ -1380,6 +1392,11 @@ ansi-escapes@^4.2.1:
dependencies:
type-fest "^0.21.3"
ansi-regex@^4.1.0:
version "4.1.1"
resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-4.1.1.tgz#164daac87ab2d6f6db3a29875e2d1766582dabed"
integrity sha512-ILlv4k/3f6vfQ4OoP2AGvirOktlQ98ZEL1k9FaQjxa3L1abBgbuTDAdPOpvbGncC0BTVQrl+OM8xZGK6tWXt7g==
ansi-regex@^5.0.0:
version "5.0.0"
resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-5.0.0.tgz#388539f55179bf39339c81af30a654d69f87cb75"
@@ -1820,6 +1837,11 @@ cjs-module-lexer@^1.0.0:
resolved "https://registry.yarnpkg.com/cjs-module-lexer/-/cjs-module-lexer-1.2.2.tgz#9f84ba3244a512f3a54e5277e8eef4c489864e40"
integrity sha512-cOU9usZw8/dXIXKtwa8pM0OTJQuJkxMN6w30csNRUerHfeQ5R6U3kkU/FtJeIf3M202OHfY2U8ccInBG7/xogA==
client-zip@^2.4.5:
version "2.4.5"
resolved "https://registry.yarnpkg.com/client-zip/-/client-zip-2.4.5.tgz#c9b6190abca57b8b4d6dcfd21c3a1f4d4ab3bc68"
integrity sha512-4y4d5ZeTH/szIAMQeC8ju67pxtvj+3u20wMGwOFrZk+pegy3aSEA2JkwgC8XVDTXP/Iqn1gyqNQXmkyBp4KLEQ==
cliui@^8.0.1:
version "8.0.1"
resolved "https://registry.yarnpkg.com/cliui/-/cliui-8.0.1.tgz#0c04b075db02cbfe60dc8e6cf2f5486b1a3608aa"
@@ -4816,16 +4838,16 @@ string-length@^4.0.1:
char-regex "^1.0.2"
strip-ansi "^6.0.0"
string-width@^4.1.0, string-width@^4.2.0:
version "4.2.2"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.2.tgz#dafd4f9559a7585cfba529c6a0a4f73488ebd4c5"
integrity sha512-XBJbT3N4JhVumXE0eoLU9DCjcaF92KLNqTmFCnG1pf8duUxFGwtP6AD6nkjw9a3IdiRtL3E2w3JDiE/xi3vOeA==
string-width@^4.1.0:
version "4.1.0"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.1.0.tgz#ba846d1daa97c3c596155308063e075ed1c99aff"
integrity sha512-NrX+1dVVh+6Y9dnQ19pR0pP4FiEIlUvdTGn8pw6CKTNq5sgib2nIhmUNT5TAmhWmvKr3WcxBcP3E8nWezuipuQ==
dependencies:
emoji-regex "^8.0.0"
is-fullwidth-code-point "^3.0.0"
strip-ansi "^6.0.0"
strip-ansi "^5.2.0"
string-width@^4.2.3:
string-width@^4.2.0, string-width@^4.2.3:
version "4.2.3"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
@@ -4870,6 +4892,13 @@ string_decoder@^1.1.1:
dependencies:
safe-buffer "~5.2.0"
strip-ansi@^5.2.0:
version "5.2.0"
resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-5.2.0.tgz#8c9a536feb6afc962bdfa5b104a5091c1ad9c0ae"
integrity sha512-DuRs1gKbBqsMKIZlrffwlug8MHkcnpjs5VPmL1PAh+mA30U0DTotfDZ0d2UUsXpPmPmMMJ6W773MaA3J+lbiWA==
dependencies:
ansi-regex "^4.1.0"
strip-ansi@^6.0.0:
version "6.0.0"
resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.0.tgz#0b1571dd7669ccd4f3e06e14ef1eed26225ae532"
@@ -4878,6 +4907,7 @@ strip-ansi@^6.0.0:
ansi-regex "^5.0.0"
strip-ansi@^6.0.1:
name strip-ansi-cjs
version "6.0.1"
resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
@@ -5247,11 +5277,13 @@ walker@^1.0.8:
dependencies:
makeerror "1.0.12"
warcio@^2.2.0, warcio@^2.2.1:
version "2.2.1"
resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.2.1.tgz#3619728fde716291c9b364744c276362a94bacec"
integrity sha512-KPLoz3aFtdTjexG+QQaubMyuLiNANzvcadGMyNKdpcmhl0k6lBHQQVpxZw3Hx9+4pbyqDXyiF4cr/h2tS8kvcw==
warcio@^2.3.0:
version "2.3.0"
resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.3.0.tgz#a655df9b5986a53e5d05aa68cda51bfefdfa8347"
integrity sha512-PCHcZ/fDE5+QECOFe/n/vzyDmAITJ1mvLx1jVONJ0uaV9OwcTbIWoh7Z0+OQwQdq8Wr1Nnb2hwhtHJ7J+9rHIQ==
dependencies:
"@types/pako" "^1.0.7"
"@types/stream-buffers" "^3.0.7"
base32-encode "^2.0.0"
hash-wasm "^4.9.0"
pako "^1.0.11"
@@ -5305,7 +5337,7 @@ which@^2.0.1:
dependencies:
isexe "^2.0.0"
wrap-ansi@^7.0.0:
wrap-ansi@7.0.0, wrap-ansi@^7.0.0:
version "7.0.0"
resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==