Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 06:23:16 +00:00)
Streaming in-place WACZ creation + CDXJ indexing (#673)
Fixes #674. This PR supersedes #505 and, instead of using js-wacz for optimized WACZ creation:

- generates an 'in-place' or 'streaming' WACZ in the crawler, without having to copy the data again
- streams WACZ contents to the remote upload (or to disk) from existing files on disk
- writes CDXJ indices per WARC to the 'warc-cdx' directory first, then merges them using the Linux 'sort' command, compressing to ZipNum if >50K (or always if using --generateCDX)
- writes and reads all data in the WARCs only once
- should result in significant speed / disk usage improvements: previously, a WARC was written once, then read again (for CDXJ indexing), read again (for adding to the new WACZ ZIP), written to disk (into the new WACZ ZIP), and read again (if uploading to a remote endpoint); now, WARCs are written once along with the per-WARC CDXJ, only the CDXJ is reread, sorted and merged on disk, and all data is read once to either generate the WACZ on disk or upload it to the remote endpoint

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
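As a rough sketch of the core technique described above (not the PR's exact code; streamZip, zipEntries and hashed are illustrative names), streaming a ZIP from files already on disk while hashing the output in the same pass looks roughly like this with client-zip:

// Sketch: stream files from disk into a ZIP while hashing the output,
// so one read pass can feed either a local file or a remote upload.
import fs from "node:fs";
import fsp from "node:fs/promises";
import { basename } from "node:path";
import { createHash } from "node:crypto";
import { Readable } from "node:stream";
import { makeZip } from "client-zip";

// Describe each file lazily; passing size up front lets client-zip
// emit ZIP entries without buffering whole files in memory.
async function* zipEntries(files: string[]) {
  for (const filename of files) {
    const { size, mtime } = await fsp.stat(filename);
    yield {
      name: basename(filename),
      lastModified: mtime,
      size,
      input: fs.createReadStream(filename),
    };
  }
}

// Returns a Node stream plus a digest() that is only meaningful
// after the stream has been fully consumed (written or uploaded).
export function streamZip(files: string[]) {
  const hasher = createHash("sha256");
  let size = 0;
  async function* hashed() {
    for await (const chunk of makeZip(zipEntries(files))) {
      hasher.update(chunk);
      size += chunk.length;
      yield chunk;
    }
  }
  const stream = Readable.from(hashed());
  return { stream, digest: () => ({ hash: hasher.digest("hex"), size }) };
}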
This commit is contained in: parent 8934feaf70, commit 85a07aff18
15 changed files with 650 additions and 203 deletions
.github/workflows/ci.yaml (vendored, 5 changes)
@@ -60,8 +60,11 @@ jobs:
       - name: add http-server for tests
         run: yarn add -D http-server
 
+      - name: install py-wacz as root for tests
+        run: sudo pip install wacz
+
       - name: run all tests as root
-        run: sudo DOCKER_HOST_NAME=172.17.0.1 yarn test
+        run: sudo DOCKER_HOST_NAME=172.17.0.1 yarn test -validate
 
       - name: run saved state + qa compare test as non-root - with volume owned by current user
         run: |
Dockerfile
@@ -17,13 +17,6 @@ EXPOSE 9222 9223 6080
 
 WORKDIR /app
 
-ADD requirements.txt /app/
-RUN python3 -m venv /app/python-venv && \
-    /app/python-venv/bin/pip install -U setuptools && \
-    /app/python-venv/bin/pip install -r requirements.txt && \
-    ln -s /app/python-venv/bin/wacz /usr/bin/wacz && \
-    ln -s /app/python-venv/bin/cdxj-indexer /usr/bin/cdxj-indexer
-
 ADD package.json yarn.lock /app/
 
 # to allow forcing rebuilds from this stage
package.json (10 changes)
@@ -17,9 +17,9 @@
   },
   "dependencies": {
     "@novnc/novnc": "^1.4.0",
-    "@types/sax": "^1.2.7",
-    "@webrecorder/wabac": "^2.19.7",
+    "@webrecorder/wabac": "^2.19.8",
     "browsertrix-behaviors": "^0.6.4",
+    "client-zip": "^2.4.5",
     "fetch-socks": "^1.3.0",
     "get-folder-size": "^4.0.0",
     "husky": "^8.0.3",
@@ -36,7 +36,7 @@
     "tsc": "^2.0.4",
     "undici": "^6.18.2",
     "uuid": "8.3.2",
-    "warcio": "^2.2.1",
+    "warcio": "^2.3.0",
     "ws": "^7.4.4",
     "yargs": "^17.7.2"
   },
@@ -46,6 +46,7 @@
     "@types/node": "^20.8.7",
     "@types/pixelmatch": "^5.2.6",
     "@types/pngjs": "^6.0.4",
+    "@types/sax": "^1.2.7",
     "@types/uuid": "^9.0.6",
     "@types/ws": "^8.5.8",
     "@typescript-eslint/eslint-plugin": "^6.10.0",
@@ -62,5 +63,8 @@
   "jest": {
     "transform": {},
     "testTimeout": 90000
   },
+  "resolutions": {
+    "wrap-ansi": "7.0.0"
+  }
 }
src/crawler.ts (204 changes)
@@ -16,6 +16,8 @@ import { parseArgs } from "./util/argParser.js";
 
 import yaml from "js-yaml";
 
+import { WACZ, WACZInitOpts, mergeCDXJ } from "./util/wacz.js";
+
 import { HealthChecker } from "./util/healthcheck.js";
 import { TextExtractViaSnapshot } from "./util/textextract.js";
 import {
@@ -62,7 +64,12 @@
 import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
 import { ScopedSeed } from "./util/seeds.js";
-import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
+import {
+  WARCWriter,
+  createWARCInfo,
+  setWARCInfo,
+  streamFinish,
+} from "./util/warcwriter.js";
 import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
 import { initProxy } from "./util/proxy.js";
 
@@ -117,7 +124,7 @@ export class Crawler {
 
   pagesFH?: WriteStream | null = null;
   extraPagesFH?: WriteStream | null = null;
-  logFH!: WriteStream;
+  logFH: WriteStream | null = null;
 
   crawlId: string;
 
@@ -150,7 +157,8 @@
 
   archivesDir: string;
   tempdir: string;
-  tempCdxDir: string;
+  warcCdxDir: string;
+  indexesDir: string;
 
   screenshotWriter: WARCWriter | null;
   textWriter: WARCWriter | null;
@@ -288,7 +296,10 @@
     // archives dir
     this.archivesDir = path.join(this.collDir, "archive");
    this.tempdir = path.join(os.tmpdir(), "tmp-dl");
-    this.tempCdxDir = path.join(this.collDir, "tmp-cdx");
+
+    // indexes dirs
+    this.warcCdxDir = path.join(this.collDir, "warc-cdx");
+    this.indexesDir = path.join(this.collDir, "indexes");
 
     this.screenshotWriter = null;
     this.textWriter = null;
@@ -470,7 +481,7 @@
     if (!this.params.dryRun) {
       await fsp.mkdir(this.archivesDir, { recursive: true });
       await fsp.mkdir(this.tempdir, { recursive: true });
-      await fsp.mkdir(this.tempCdxDir, { recursive: true });
+      await fsp.mkdir(this.warcCdxDir, { recursive: true });
     }
 
     this.logFH = fs.createWriteStream(this.logFilename, { flags: "a" });
@@ -1478,36 +1489,24 @@ self.__bx_behaviors.selectMainBehavior();
       await this.combineWARC();
     }
 
-    if (this.params.generateCDX && !this.params.dryRun) {
-      logger.info("Generating CDX");
-      await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
-      await this.crawlState.setStatus("generate-cdx");
-
-      const warcList = await fsp.readdir(this.archivesDir);
-      const warcListFull = warcList.map((filename) =>
-        path.join(this.archivesDir, filename),
-      );
-
-      //const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
-      const params = [
-        "-o",
-        path.join(this.collDir, "indexes", "index.cdxj"),
-        ...warcListFull,
-      ];
-      const indexResult = await this.awaitProcess(
-        child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd }),
-      );
-      if (indexResult === 0) {
-        logger.debug("Indexing complete, CDX successfully created");
-      } else {
-        logger.error("Error indexing and generating CDX", {
-          "status code": indexResult,
-        });
-      }
-    }
-
     logger.info("Crawling done");
 
+    if (
+      (this.params.generateCDX || this.params.generateWACZ) &&
+      !this.params.dryRun
+    ) {
+      logger.info("Merging CDX");
+      await this.crawlState.setStatus(
+        this.params.generateWACZ ? "generate-wacz" : "generate-cdx",
+      );
+
+      await mergeCDXJ(
+        this.warcCdxDir,
+        this.indexesDir,
+        this.params.generateWACZ ? null : false,
+      );
+    }
+
     if (
       this.params.generateWACZ &&
       !this.params.dryRun &&
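Aside on the hunk above: the third argument to mergeCDXJ (defined in src/util/wacz.ts, shown later in this commit) is a three-state flag. A hedged reading, based on the mergeCDXJ code below:

// zipped === false -> always write a plain index.cdxj (the --generateCDX-only case)
// zipped === null  -> decide by size: write ZipNum (index.cdx.gz + index.idx)
//                     only if the per-WARC CDXJ totals >= ZIP_CDX_MIN_SIZE (50_000 bytes)
// zipped === true  -> always write ZipNum
await mergeCDXJ(warcCdxDir, indexesDir, generateWACZ ? null : false);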
@@ -1543,11 +1542,9 @@ self.__bx_behaviors.selectMainBehavior();
     if (!this.logFH) {
       return;
     }
-    try {
-      await new Promise<void>((resolve) => this.logFH.close(() => resolve()));
-    } catch (e) {
-      // ignore
-    }
+    const logFH = this.logFH;
+    this.logFH = null;
+    await streamFinish(logFH);
   }
 
   async generateWACZ() {
@@ -1577,110 +1574,67 @@ self.__bx_behaviors.selectMainBehavior();
       logger.fatal("No WARC Files, assuming crawl failed");
     }
 
-    logger.debug("End of log file, storing logs in WACZ");
+    const waczPath = path.join(this.collDir, this.params.collection + ".wacz");
+    const streaming = !!this.storage;
 
-    // Build the argument list to pass to the wacz create command
-    const waczFilename = this.params.collection.concat(".wacz");
-    const waczPath = path.join(this.collDir, waczFilename);
+    if (!streaming) {
+      logger.debug("WACZ will be written to disk", { path: waczPath }, "wacz");
+    } else {
+      logger.debug("WACZ will be stream uploaded to remote storage");
+    }
 
-    const createArgs = [
-      "create",
-      "-o",
-      waczPath,
-      "--pages",
-      this.seedPagesFile,
-      "--extra-pages",
-      this.otherPagesFile,
-      "--copy-pages",
-      "--log-directory",
-      this.logDir,
-    ];
+    logger.debug("End of log file in WACZ, storing logs to WACZ file");
+
+    await this.closeLog();
+
+    const waczOpts: WACZInitOpts = {
+      input: warcFileList.map((x) => path.join(this.archivesDir, x)),
+      output: waczPath,
+      pages: this.pagesDir,
+      logDirectory: this.logDir,
+      warcCdxDir: this.warcCdxDir,
+      indexesDir: this.indexesDir,
+      softwareString: this.infoString,
+    };
 
     if (process.env.WACZ_SIGN_URL) {
-      createArgs.push("--signing-url");
-      createArgs.push(process.env.WACZ_SIGN_URL);
+      waczOpts.signingUrl = process.env.WACZ_SIGN_URL;
       if (process.env.WACZ_SIGN_TOKEN) {
-        createArgs.push("--signing-token");
-        createArgs.push(process.env.WACZ_SIGN_TOKEN);
+        waczOpts.signingToken = "bearer " + process.env.WACZ_SIGN_TOKEN;
       }
     }
 
     if (this.params.title) {
-      createArgs.push("--title");
-      createArgs.push(this.params.title);
+      waczOpts.title = this.params.title;
     }
 
     if (this.params.description) {
-      createArgs.push("--desc");
-      createArgs.push(this.params.description);
+      waczOpts.description = this.params.description;
     }
 
-    createArgs.push("-f");
+    try {
+      const wacz = new WACZ(waczOpts, this.collDir);
+      if (!streaming) {
+        await wacz.generateToFile(waczPath);
+      }
 
-    warcFileList.forEach((val) =>
-      createArgs.push(path.join(this.archivesDir, val)),
-    );
+      if (this.storage) {
+        await this.crawlState.setStatus("uploading-wacz");
+        const filename = process.env.STORE_FILENAME || "@ts-@id.wacz";
+        const targetFilename = interpolateFilename(filename, this.crawlId);
 
-    // create WACZ
-    const waczResult = await this.awaitProcess(
-      child_process.spawn("wacz", createArgs, { detached: RUN_DETACHED }),
-    );
+        await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);
+        return true;
+      }
 
-    if (waczResult !== 0) {
-      logger.error("Error creating WACZ", { "status code": waczResult });
-      logger.fatal("Unable to write WACZ successfully");
+      return false;
+    } catch (e) {
+      logger.error("Error creating WACZ", e);
+      if (!streaming) {
+        logger.fatal("Unable to write WACZ successfully");
+      }
     }
-
-    logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);
-
-    // Verify WACZ
-    /*
-    const validateArgs = ["validate"];
-    validateArgs.push("-f");
-    validateArgs.push(waczPath);
-
-    const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs));
-
-    if (waczVerifyResult !== 0) {
-      console.log("validate", waczVerifyResult);
-      logger.fatal("Unable to verify WACZ created successfully");
-    }
-    */
-    if (this.storage) {
-      await this.crawlState.setStatus("uploading-wacz");
-      const filename = process.env.STORE_FILENAME || "@ts-@id.wacz";
-      const targetFilename = interpolateFilename(filename, this.crawlId);
-
-      await this.storage.uploadCollWACZ(waczPath, targetFilename, isFinished);
-      return true;
-    }
-
-    return false;
   }
 
   awaitProcess(proc: ChildProcess) {
     const stdout: string[] = [];
     const stderr: string[] = [];
 
     proc.stdout!.on("data", (data) => {
       stdout.push(data.toString());
     });
 
     proc.stderr!.on("data", (data) => {
       stderr.push(data.toString());
     });
 
     return new Promise((resolve) => {
       proc.on("close", (code) => {
         if (stdout.length) {
           logger.debug(stdout.join("\n"));
         }
         if (stderr.length && this.params.logging.includes("debug")) {
           logger.debug(stderr.join("\n"));
         }
         resolve(code);
       });
     });
   }
 
   logMemory() {
@@ -2604,7 +2558,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     return new WARCWriter({
       archivesDir: this.archivesDir,
-      tempCdxDir: this.tempCdxDir,
+      warcCdxDir: this.warcCdxDir,
       filenameTemplate,
       rolloverSize: this.params.rolloverSize,
       gzip,
src/util/argParser.ts
@@ -201,7 +201,7 @@ class ArgParser {
 
     generateWACZ: {
       alias: ["generatewacz", "generateWacz"],
-      describe: "If set, generate wacz",
+      describe: "If set, generate WACZ on disk",
       type: "boolean",
       default: false,
     },
src/util/logger.ts
@@ -51,6 +51,7 @@ export const LOG_CONTEXT_TYPES = [
   "crawlStatus",
   "links",
   "sitemap",
+  "wacz",
   "replay",
   "proxy",
 ] as const;
src/util/storage.ts
@@ -14,6 +14,8 @@ import { logger } from "./logger.js";
 // @ts-expect-error (incorrect types on get-folder-size)
 import getFolderSize from "get-folder-size";
 
+import { WACZ } from "./wacz.js";
+
 const DEFAULT_REGION = "us-east-1";
 
 // ===========================================================================
@@ -81,6 +83,32 @@ export class S3StorageSync {
     this.webhookUrl = webhookUrl;
   }
 
+  async uploadStreamingWACZ(wacz: WACZ, targetFilename: string) {
+    const fileUploadInfo = {
+      bucket: this.bucketName,
+      crawlId: this.crawlId,
+      prefix: this.objectPrefix,
+      targetFilename,
+    };
+    logger.info("S3 file upload information", fileUploadInfo, "storage");
+
+    const waczStream = wacz.generate();
+
+    await this.client.putObject(
+      this.bucketName,
+      this.objectPrefix + targetFilename,
+      waczStream,
+    );
+
+    const hash = wacz.getHash();
+    const path = targetFilename;
+
+    const size = wacz.getSize();
+
+    // for backwards compatibility, keep 'bytes'
+    return { path, size, hash, bytes: size };
+  }
+
   async uploadFile(srcFilename: string, targetFilename: string) {
     const fileUploadInfo = {
       bucket: this.bucketName,
@@ -114,11 +142,15 @@ export class S3StorageSync {
   }
 
   async uploadCollWACZ(
-    srcFilename: string,
+    srcOrWACZ: string | WACZ,
     targetFilename: string,
     completed = true,
   ) {
-    const resource = await this.uploadFile(srcFilename, targetFilename);
+    const resource =
+      typeof srcOrWACZ === "string"
+        ? await this.uploadFile(srcOrWACZ, targetFilename)
+        : await this.uploadStreamingWACZ(srcOrWACZ, targetFilename);
+
     logger.info(
       "WACZ S3 file upload resource",
       { targetFilename, resource },
@@ -191,7 +223,7 @@ export async function getFileSize(filename: string) {
   return stats.size;
 }
 
-export async function getDirSize(dir: string) {
+export async function getDirSize(dir: string): Promise<number> {
   const { size, errors } = await getFolderSize(dir);
   if (errors && errors.length) {
     logger.warn("Size check errors", { errors }, "storage");
@@ -234,10 +266,15 @@ export async function checkDiskUtilization(
   const kbTotal = parseInt(diskUsage["1K-blocks"]);
 
   let kbArchiveDirSize = Math.round(archiveDirSize / 1024);
-  if (params.combineWARC && params.generateWACZ) {
-    kbArchiveDirSize *= 4;
-  } else if (params.combineWARC || params.generateWACZ) {
-    kbArchiveDirSize *= 2;
+
+  // assume if has STORE_ENDPOINT_URL, will be uploading to remote
+  // and not storing local copy of either WACZ or WARC
+  if (!process.env.STORE_ENDPOINT_URL) {
+    if (params.combineWARC && params.generateWACZ) {
+      kbArchiveDirSize *= 4;
+    } else if (params.combineWARC || params.generateWACZ) {
+      kbArchiveDirSize *= 2;
+    }
   }
 
   const projectedTotal = kbUsed + kbArchiveDirSize;
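One subtlety worth calling out in uploadStreamingWACZ above: the WACZ's final hash and size are computed inside the generate() stream as a side effect of it being read, so they are only valid after putObject has fully consumed the stream. A sketch of that ordering (illustrative only; client, bucket and key stand in for the minio client and upload target):

const wacz = new WACZ(waczOpts, collDir);     // waczOpts/collDir as in crawler.ts
const stream = wacz.generate();               // nothing has been read yet

await client.putObject(bucket, key, stream);  // fully consumes the stream

const hash = wacz.getHash();                  // now valid
const size = wacz.getSize();                  // now valid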
src/util/wacz.ts (new file, 429 lines)
@@ -0,0 +1,429 @@
import path, { basename } from "node:path";
import fs from "node:fs";
import fsp from "node:fs/promises";
import { Writable, Readable } from "node:stream";
import { pipeline } from "node:stream/promises";
import readline from "node:readline";
import child_process from "node:child_process";

import { createHash, Hash } from "node:crypto";

import { gzip } from "node:zlib";

import { ReadableStream } from "node:stream/web";

import { makeZip, InputWithoutMeta } from "client-zip";
import { logger, formatErr } from "./logger.js";
import { streamFinish } from "./warcwriter.js";
import { getDirSize } from "./storage.js";

const DATAPACKAGE_JSON = "datapackage.json";
const DATAPACKAGE_DIGEST_JSON = "datapackage-digest.json";

const INDEX_CDXJ = "index.cdxj";
const INDEX_IDX = "index.idx";
const INDEX_CDX_GZ = "index.cdx.gz";

const LINES_PER_BLOCK = 256;

const ZIP_CDX_MIN_SIZE = 50_000;

// ============================================================================
export type WACZInitOpts = {
  input: string[];
  output: string;
  pages: string;
  warcCdxDir: string;
  indexesDir: string;
  logDirectory: string;

  softwareString: string;

  signingUrl?: string;
  signingToken?: string;
  title?: string;
  description?: string;
};

export type WACZResourceEntry = {
  name: string;
  path: string;
  hash: string;
  bytes: number;
};

export type WACZDataPackage = {
  resources: WACZResourceEntry[];
  created: string;
  wacz_version: string;
  software: string;
  title?: string;
  description?: string;
};

type WACZDigest = {
  path: string;
  hash: string;
  signedData?: string;
};

class CurrZipFileMarker extends Uint8Array {
  // empty array to mark start of WACZ file, also track metadata per-file
  filename: string;
  zipPath: string;
  size: number;
  hasher: Hash;

  constructor(filename: string, zipPath: string, size: number) {
    super();
    this.filename = filename;
    this.zipPath = zipPath;
    this.size = size;
    this.hasher = createHash("sha256");
  }
}

class EndOfZipFileMarker extends Uint8Array {
  // empty array to mark end of WACZ file
}

// ============================================================================
export class WACZ {
  collDir: string;

  warcs: string[];

  pagesDir: string;
  logsDir: string;
  warcCdxDir: string;
  indexesDir: string;

  datapackage: WACZDataPackage;

  signingUrl: string | null;
  signingToken: string | null;

  private size = 0;
  private hash: string = "";

  constructor(config: WACZInitOpts, collDir: string) {
    this.warcs = config.input;
    this.pagesDir = config.pages;
    this.logsDir = config.logDirectory;
    this.warcCdxDir = config.warcCdxDir;
    this.collDir = collDir;
    this.indexesDir = config.indexesDir;

    this.datapackage = {
      resources: [],
      // drop microseconds
      created: new Date().toISOString().split(".", 1)[0] + "Z",
      wacz_version: "1.1.1",
      software: config.softwareString,
    };

    if (config.title) {
      this.datapackage.title = config.title;
    }
    if (config.description) {
      this.datapackage.description = config.description;
    }

    this.signingUrl = config.signingUrl || null;
    this.signingToken = config.signingToken || null;
  }

  generate(): Readable {
    const files = [
      ...this.warcs,
      ...addDirFiles(this.indexesDir),
      ...addDirFiles(this.pagesDir),
      ...addDirFiles(this.logsDir),
    ];

    const zip = makeZip(
      this.iterDirForZip(files),
    ) as ReadableStream<Uint8Array>;

    const hasher = createHash("sha256");
    const resources = this.datapackage.resources;

    let size = 0;

    async function* iterWACZ(wacz: WACZ): AsyncIterable<Uint8Array> {
      let currFile: CurrZipFileMarker | null = null;

      for await (const chunk of zip) {
        if (chunk instanceof CurrZipFileMarker) {
          currFile = chunk;
        } else if (chunk instanceof EndOfZipFileMarker) {
          if (currFile) {
            // Frictionless data validation requires this to be lowercase
            const name = basename(currFile.filename).toLowerCase();
            const path = currFile.zipPath;
            const bytes = currFile.size;
            const hash = "sha256:" + currFile.hasher.digest("hex");
            resources.push({ name, path, bytes, hash });
            logger.debug("Added file to WACZ", { path, bytes, hash }, "wacz");
          }
          currFile = null;
        } else {
          yield chunk;
          if (currFile) {
            currFile.hasher.update(chunk);
          }
          hasher.update(chunk);
          size += chunk.length;
        }
      }

      wacz.hash = hasher.digest("hex");
      wacz.size = size;
    }

    return Readable.from(iterWACZ(this));
  }

  getHash() {
    return this.hash;
  }

  getSize() {
    return this.size;
  }

  async generateToFile(filename: string) {
    await pipeline(this.generate(), fs.createWriteStream(filename));
  }

  async *iterDirForZip(files: string[]): AsyncGenerator<InputWithoutMeta> {
    const encoder = new TextEncoder();
    const end = new EndOfZipFileMarker();

    async function* wrapMarkers(
      start: CurrZipFileMarker,
      iter: AsyncIterable<Uint8Array>,
    ) {
      yield start;
      yield* iter;
      yield end;
    }

    async function* getData(data: Uint8Array) {
      yield data;
    }

    for (const filename of files) {
      const input = fs.createReadStream(filename);

      const stat = await fsp.stat(filename);
      const lastModified = stat.mtime;
      const size = stat.size;

      const nameStr = filename.slice(this.collDir.length + 1);
      const name = encoder.encode(nameStr);

      const currFile = new CurrZipFileMarker(filename, nameStr, size);

      yield { input: wrapMarkers(currFile, input), lastModified, name, size };
    }

    // datapackage.json

    const datapackageData = encoder.encode(
      JSON.stringify(this.datapackage, null, 2),
    );

    yield {
      input: getData(datapackageData),
      lastModified: new Date(),
      name: DATAPACKAGE_JSON,
      size: datapackageData.length,
    };

    const hash =
      "sha256:" + createHash("sha256").update(datapackageData).digest("hex");

    // datapackage-digest.json

    const digest: WACZDigest = {
      path: DATAPACKAGE_JSON,
      hash,
    };

    // Get Signature
    if (this.signingUrl) {
      const body = JSON.stringify({
        hash,
        created: this.datapackage.created,
      });

      const headers: Record<string, string> = {
        "Content-Type": "application/json",
      };

      if (this.signingToken) {
        headers["Authorization"] = this.signingToken;
      }

      try {
        const response = await fetch(this.signingUrl, {
          method: "POST",
          headers,
          body,
        });
        digest.signedData = await response.json();
      } catch (e) {
        logger.warn(
          "Failed to sign WACZ, continuing w/o signature",
          { ...formatErr(e) },
          "wacz",
        );
      }
    }

    const digestData = encoder.encode(JSON.stringify(digest, null, 2));

    yield {
      input: getData(digestData),
      lastModified: new Date(),
      name: DATAPACKAGE_DIGEST_JSON,
      size: digestData.length,
    };
  }
}

// Merge CDX
export function addDirFiles(fullDir: string): string[] {
  const files = fs.readdirSync(fullDir);
  return files.map((name) => path.join(fullDir, name));
}

export async function mergeCDXJ(
  warcCdxDir: string,
  indexesDir: string,
  zipped: boolean | null = null,
) {
  async function* readLinesFrom(stdout: Readable): AsyncGenerator<string> {
    for await (const line of readline.createInterface({ input: stdout })) {
      yield line + "\n";
    }
  }

  async function* generateCompressed(
    reader: AsyncGenerator<string>,
    idxFile: Writable,
  ) {
    let offset = 0;

    const encoder = new TextEncoder();

    const filename = INDEX_CDX_GZ;

    let cdxLines: string[] = [];

    let key = "";
    let count = 0;

    idxFile.write(
      `!meta 0 ${JSON.stringify({
        format: "cdxj-gzip-1.0",
        filename: INDEX_CDX_GZ,
      })}\n`,
    );

    const finishChunk = async () => {
      const compressed = await new Promise<Uint8Array>((resolve) => {
        gzip(encoder.encode(cdxLines.join("")), (_, result) => {
          if (result) {
            resolve(result);
          }
        });
      });

      const length = compressed.length;
      const digest =
        "sha256:" + createHash("sha256").update(compressed).digest("hex");

      const idx =
        key + " " + JSON.stringify({ offset, length, digest, filename });

      idxFile.write(idx + "\n");

      offset += length;

      count = 1;
      key = "";
      cdxLines = [];

      return compressed;
    };

    for await (const cdx of reader) {
      if (!key) {
        key = cdx.split(" {", 1)[0];
      }

      if (++count === LINES_PER_BLOCK) {
        yield await finishChunk();
      }
      cdxLines.push(cdx);
    }

    if (key) {
      yield await finishChunk();
    }
  }

  await fsp.mkdir(indexesDir, { recursive: true });

  const removeIndexFile = async (filename: string) => {
    try {
      await fsp.unlink(path.join(indexesDir, filename));
    } catch (e) {
      // ignore
    }
  };

  const cdxFiles = addDirFiles(warcCdxDir);

  if (!cdxFiles.length) {
    logger.info("No CDXJ files to merge");
    return;
  }

  if (zipped === null) {
    const tempCdxSize = await getDirSize(warcCdxDir);

    // if CDX size is at least this size, use compressed version
    zipped = tempCdxSize >= ZIP_CDX_MIN_SIZE;
  }

  const proc = child_process.spawn("sort", cdxFiles, {
    env: { LC_ALL: "C" },
  });

  if (!zipped) {
    const output = fs.createWriteStream(path.join(indexesDir, INDEX_CDXJ));

    await pipeline(Readable.from(readLinesFrom(proc.stdout)), output);

    await removeIndexFile(INDEX_IDX);
    await removeIndexFile(INDEX_CDX_GZ);
  } else {
    const output = fs.createWriteStream(path.join(indexesDir, INDEX_CDX_GZ));

    const outputIdx = fs.createWriteStream(path.join(indexesDir, INDEX_IDX), {
      encoding: "utf-8",
    });

    await pipeline(
      Readable.from(generateCompressed(readLinesFrom(proc.stdout), outputIdx)),
      output,
    );

    await streamFinish(outputIdx);

    await removeIndexFile(INDEX_CDXJ);
  }
}
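For context on the ZipNum output that generateCompressed produces above: index.idx holds one plain-text line per gzip block (the block's first CDXJ key plus a JSON pointer with offset/length into index.cdx.gz), so a reader can scan or binary-search the small .idx and decompress just one ~256-line block per lookup. A minimal lookup sketch (not part of this PR; readZipNumBlock and its handling of relative paths are illustrative):

import fsp from "node:fs/promises";
import { gunzipSync } from "node:zlib";

// Decompress the single gzip block of index.cdx.gz that one index.idx
// line points at, returning its CDXJ lines. The stored filename is
// relative to the indexes directory in this PR's output.
async function readZipNumBlock(idxLine: string): Promise<string[]> {
  // idx lines look like: <searchKey> {"offset":0,"length":123,"digest":...,"filename":"index.cdx.gz"}
  const { offset, length, filename } = JSON.parse(
    idxLine.slice(idxLine.indexOf(" {") + 1),
  );
  const fh = await fsp.open(filename, "r");
  try {
    const buf = Buffer.alloc(length);
    await fh.read(buf, 0, length, offset);
    return gunzipSync(buf).toString("utf-8").split("\n").filter(Boolean);
  } finally {
    await fh.close();
  }
}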
src/util/warcwriter.ts
@@ -24,7 +24,7 @@ export type ResourceRecordData = {
 // =================================================================
 export class WARCWriter implements IndexerOffsetLength {
   archivesDir: string;
-  tempCdxDir?: string;
+  warcCdxDir?: string;
   filenameTemplate: string;
   filename?: string;
   gzip: boolean;
@@ -45,23 +45,21 @@ export class WARCWriter implements IndexerOffsetLength {
 
   constructor({
     archivesDir,
-    tempCdxDir,
+    warcCdxDir,
     filenameTemplate,
     rolloverSize = DEFAULT_ROLLOVER_SIZE,
     gzip,
     logDetails,
   }: {
     archivesDir: string;
-    tempCdxDir?: string;
+    warcCdxDir?: string;
     filenameTemplate: string;
     rolloverSize?: number;
     gzip: boolean;
     logDetails: Record<string, string>;
   }) {
     this.archivesDir = archivesDir;
-    this.tempCdxDir = tempCdxDir;
-    // for now, disabling CDX
-    this.tempCdxDir = undefined;
+    this.warcCdxDir = warcCdxDir;
     this.logDetails = logDetails;
     this.gzip = gzip;
     this.rolloverSize = rolloverSize;
@@ -77,7 +75,7 @@ export class WARCWriter implements IndexerOffsetLength {
     this.offset = 0;
     this.recordLength = 0;
 
-    if (this.tempCdxDir) {
+    if (this.warcCdxDir) {
       this.indexer = new CDXIndexer({ format: "cdxj" });
     }
 
@@ -112,14 +110,19 @@ export class WARCWriter implements IndexerOffsetLength {
         flags: "a",
       });
     }
-    if (!this.cdxFH && this.tempCdxDir) {
+    if (!this.cdxFH && this.warcCdxDir) {
       this.cdxFH = fs.createWriteStream(
-        path.join(this.tempCdxDir, this.filename + ".cdx"),
+        path.join(this.warcCdxDir, this.filename + ".cdx"),
         { flags: "a" },
       );
     }
 
-    fh.write(await createWARCInfo(this.filename));
+    const buffer = await createWARCInfo(this.filename);
+    fh.write(buffer);
+
+    // account for size of warcinfo record, (don't index as warcinfo never added to cdx)
+    this.recordLength = buffer.length;
+    this.offset += buffer.length;
 
     return fh;
   }
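streamFinish is imported from warcwriter.js in several hunks above, but its definition is not part of the displayed diff. Presumably it ends a write stream and waits for it to flush; a hedged sketch of such a helper:

import { Writable } from "node:stream";
import { finished } from "node:stream/promises";

// Assumed shape: end the stream, then resolve once all buffered data
// has been flushed (the stream's 'finish' event has fired).
export async function streamFinish(fh: Writable) {
  fh.end();
  await finished(fh);
}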
tests/basic_crawl.test.js
@@ -3,20 +3,25 @@ import fs from "fs";
 import path from "path";
 import md5 from "md5";
 
+const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0];
+const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...args);
+
 test("ensure basic crawl run with docker run passes", async () => {
   child_process.execSync(
     'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix',
   );
 
-  child_process.execSync(
-    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz",
-  );
-
   child_process.execSync(
     "unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz",
   );
 });
 
+testIf(doValidate, "validate wacz", () => {
+  child_process.execSync(
+    "wacz validate --file ./test-crawls/collections/wr-net/wr-net.wacz",
+  );
+});
+
 test("check that individual WARCs have correct prefix and are under rollover size", () => {
   const archiveWarcLists = fs.readdirSync(
     "test-crawls/collections/wr-net/archive",
tests/multi_url_crawl.test.js
@@ -1,13 +1,18 @@
 import child_process from "child_process";
 import fs from "fs";
 
+const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0];
+const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...args);
+
 test("ensure multi url crawl run with docker run passes", async () => {
   child_process.execSync(
     'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2',
   );
+});
 
+testIf(doValidate, "validate multi url crawl wacz", () => {
   child_process.execSync(
-    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz",
+    "wacz validate --file ./test-crawls/collections/advanced/advanced.wacz",
   );
 });
@@ -127,7 +127,7 @@ function validateResourcesIndex(json) {
       mime: "image/vnd.microsoft.icon",
       type: "other",
     },
-    "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2F&d=webrecorder.net":
+    "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2F&d=webrecorder.net&r=null":
       { status: 202, mime: "text/plain", type: "xhr" },
   });
 }
@@ -172,7 +172,7 @@ function validateResourcesAbout(json) {
     { status: 200, mime: "font/woff2", type: "font" },
     "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
       { status: 200, mime: "font/woff2", type: "font" },
-    "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net":
+    "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net&r=null":
       {
         status: 0,
         type: "xhr",
@ -1,19 +0,0 @@
|
|||
import child_process from "child_process";
|
||||
|
||||
test("ensure crawl run with redis passes", async () => {
|
||||
const redis = child_process.spawn(
|
||||
"docker run -d --name test-crawl-redis -p 6379:6379 redis",
|
||||
);
|
||||
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection redis-crawl --redisStoreUrl redis://127.0.0.1:6379 --workers 2",
|
||||
);
|
||||
|
||||
redis.kill("SIGINT");
|
||||
});
|
||||
|
||||
test("check that wacz created is valid", () => {
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz",
|
||||
);
|
||||
});
|
|
tsconfig.json
@@ -102,7 +102,7 @@
 
     /* Completeness */
     // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
-    "skipLibCheck": true /* Skip type checking all .d.ts files. */
+    "skipLibCheck": false /* Skip type checking all .d.ts files. */
   },
 
   "include": ["src/**/*"]
yarn.lock (76 changes)
@@ -1148,6 +1148,11 @@
   dependencies:
     undici-types "~5.25.1"
 
+"@types/pako@^1.0.7":
+  version "1.0.7"
+  resolved "https://registry.yarnpkg.com/@types/pako/-/pako-1.0.7.tgz#aa0e4af9855d81153a29ff84cc44cce25298eda9"
+  integrity sha512-YBtzT2ztNF6R/9+UXj2wTGFnC9NklAnASt3sC0h2m1bbH7G6FyBIkt4AN8ThZpNfxUo1b2iMVO0UawiJymEt8A==
+
 "@types/pixelmatch@^5.2.6":
   version "5.2.6"
   resolved "https://registry.yarnpkg.com/@types/pixelmatch/-/pixelmatch-5.2.6.tgz#fba6de304ac958495f27d85989f5c6bb7499a686"
@@ -1179,6 +1184,13 @@
   resolved "https://registry.yarnpkg.com/@types/stack-utils/-/stack-utils-2.0.0.tgz#7036640b4e21cc2f259ae826ce843d277dad8cff"
   integrity sha512-RJJrrySY7A8havqpGObOB4W92QXKJo63/jFLLgpvOtsGUqbQZ9Sbgl35KMm1DjC6j7AvmmU2bIno+3IyEaemaw==
 
+"@types/stream-buffers@^3.0.7":
+  version "3.0.7"
+  resolved "https://registry.yarnpkg.com/@types/stream-buffers/-/stream-buffers-3.0.7.tgz#0b719fa1bd2ca2cc0908205a440e5e569e1aa21e"
+  integrity sha512-azOCy05sXVXrO+qklf0c/B07H/oHaIuDDAiHPVwlk3A9Ek+ksHyTeMajLZl3r76FxpPpxem//4Te61G1iW3Giw==
+  dependencies:
+    "@types/node" "*"
+
 "@types/uuid@^9.0.6":
   version "9.0.6"
   resolved "https://registry.yarnpkg.com/@types/uuid/-/uuid-9.0.6.tgz#c91ae743d8344a54b2b0c691195f5ff5265f6dfb"
@@ -1300,15 +1312,15 @@
   resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
   integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
 
-"@webrecorder/wabac@^2.19.7":
-  version "2.19.7"
-  resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.7.tgz#3afe48f79752bcd189cffd5d5e6a8dbe4f394053"
-  integrity sha512-X9UFxWCww1KWDnAaEjg7vpg6SznBov5a88FPxbOvo5yCT/UkJcQHaa0qo1L52l46sIAUnSbsYz1ur9yMd6ygVA==
+"@webrecorder/wabac@^2.19.8":
+  version "2.19.8"
+  resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.8.tgz#302ade200953a1c76f0b355983ae4081428fb933"
+  integrity sha512-WjyfsGK8JWKeeDsGrOIT8ZLjMcOOAN93OMnRLO214jSV18SHEOY4JRvXzFOLF+OWYC5kJIMjl05gurTLq18jOA==
   dependencies:
     "@peculiar/asn1-ecc" "^2.3.4"
     "@peculiar/asn1-schema" "^2.3.3"
     "@peculiar/x509" "^1.9.2"
-    "@webrecorder/wombat" "^3.7.14"
+    "@webrecorder/wombat" "^3.8.0"
     acorn "^8.10.0"
     auto-js-ipfs "^2.1.1"
     base64-js "^1.5.1"
@@ -1327,14 +1339,14 @@
     path-parser "^6.1.0"
     process "^0.11.10"
     stream-browserify "^3.0.0"
-    warcio "^2.2.1"
+    warcio "^2.3.0"
 
-"@webrecorder/wombat@^3.7.14":
-  version "3.7.14"
-  resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.14.tgz#3779e4cadb256755bbbfd2960805965ec4daacd8"
-  integrity sha512-sDNH+c8WstQrK91y8kIPJh1XAC2WXLU5rC8wztANzK1mVzA7v6XB5gk3Yp7OIAn4bn1XuGRVjubhKhmxVVZ9kg==
+"@webrecorder/wombat@^3.8.0":
+  version "3.8.0"
+  resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.0.tgz#63ed3df199f11223b23c9ce66202590b8511ae2a"
+  integrity sha512-MpzNu9+ClCHjOER9XCrsEIsJk15L6qGO+PxeBPiOtaFJmNUiz0auMT5AQwiPqJgKEAniZTlPx1O4kNCVJu9f2Q==
   dependencies:
-    warcio "^2.2.0"
+    warcio "^2.3.0"
 
 "@zxing/text-encoding@0.9.0":
   version "0.9.0"
@@ -1380,6 +1392,11 @@ ansi-escapes@^4.2.1:
   dependencies:
     type-fest "^0.21.3"
 
+ansi-regex@^4.1.0:
+  version "4.1.1"
+  resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-4.1.1.tgz#164daac87ab2d6f6db3a29875e2d1766582dabed"
+  integrity sha512-ILlv4k/3f6vfQ4OoP2AGvirOktlQ98ZEL1k9FaQjxa3L1abBgbuTDAdPOpvbGncC0BTVQrl+OM8xZGK6tWXt7g==
+
 ansi-regex@^5.0.0:
   version "5.0.0"
   resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-5.0.0.tgz#388539f55179bf39339c81af30a654d69f87cb75"
@@ -1820,6 +1837,11 @@ cjs-module-lexer@^1.0.0:
   resolved "https://registry.yarnpkg.com/cjs-module-lexer/-/cjs-module-lexer-1.2.2.tgz#9f84ba3244a512f3a54e5277e8eef4c489864e40"
   integrity sha512-cOU9usZw8/dXIXKtwa8pM0OTJQuJkxMN6w30csNRUerHfeQ5R6U3kkU/FtJeIf3M202OHfY2U8ccInBG7/xogA==
 
+client-zip@^2.4.5:
+  version "2.4.5"
+  resolved "https://registry.yarnpkg.com/client-zip/-/client-zip-2.4.5.tgz#c9b6190abca57b8b4d6dcfd21c3a1f4d4ab3bc68"
+  integrity sha512-4y4d5ZeTH/szIAMQeC8ju67pxtvj+3u20wMGwOFrZk+pegy3aSEA2JkwgC8XVDTXP/Iqn1gyqNQXmkyBp4KLEQ==
+
 cliui@^8.0.1:
   version "8.0.1"
   resolved "https://registry.yarnpkg.com/cliui/-/cliui-8.0.1.tgz#0c04b075db02cbfe60dc8e6cf2f5486b1a3608aa"
@@ -4816,16 +4838,16 @@ string-length@^4.0.1:
     char-regex "^1.0.2"
     strip-ansi "^6.0.0"
 
-string-width@^4.1.0, string-width@^4.2.0:
-  version "4.2.2"
-  resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.2.tgz#dafd4f9559a7585cfba529c6a0a4f73488ebd4c5"
-  integrity sha512-XBJbT3N4JhVumXE0eoLU9DCjcaF92KLNqTmFCnG1pf8duUxFGwtP6AD6nkjw9a3IdiRtL3E2w3JDiE/xi3vOeA==
+string-width@^4.1.0:
+  version "4.1.0"
+  resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.1.0.tgz#ba846d1daa97c3c596155308063e075ed1c99aff"
+  integrity sha512-NrX+1dVVh+6Y9dnQ19pR0pP4FiEIlUvdTGn8pw6CKTNq5sgib2nIhmUNT5TAmhWmvKr3WcxBcP3E8nWezuipuQ==
   dependencies:
     emoji-regex "^8.0.0"
     is-fullwidth-code-point "^3.0.0"
-    strip-ansi "^6.0.0"
+    strip-ansi "^5.2.0"
 
-string-width@^4.2.3:
+string-width@^4.2.0, string-width@^4.2.3:
   version "4.2.3"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
   integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
@@ -4870,6 +4892,13 @@ string_decoder@^1.1.1:
   dependencies:
     safe-buffer "~5.2.0"
 
+strip-ansi@^5.2.0:
+  version "5.2.0"
+  resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-5.2.0.tgz#8c9a536feb6afc962bdfa5b104a5091c1ad9c0ae"
+  integrity sha512-DuRs1gKbBqsMKIZlrffwlug8MHkcnpjs5VPmL1PAh+mA30U0DTotfDZ0d2UUsXpPmPmMMJ6W773MaA3J+lbiWA==
+  dependencies:
+    ansi-regex "^4.1.0"
+
 strip-ansi@^6.0.0:
   version "6.0.0"
   resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.0.tgz#0b1571dd7669ccd4f3e06e14ef1eed26225ae532"
@@ -4878,6 +4907,7 @@ strip-ansi@^6.0.0:
     ansi-regex "^5.0.0"
 
 strip-ansi@^6.0.1:
+  name strip-ansi-cjs
   version "6.0.1"
   resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
   integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
@@ -5247,11 +5277,13 @@ walker@^1.0.8:
   dependencies:
     makeerror "1.0.12"
 
-warcio@^2.2.0, warcio@^2.2.1:
-  version "2.2.1"
-  resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.2.1.tgz#3619728fde716291c9b364744c276362a94bacec"
-  integrity sha512-KPLoz3aFtdTjexG+QQaubMyuLiNANzvcadGMyNKdpcmhl0k6lBHQQVpxZw3Hx9+4pbyqDXyiF4cr/h2tS8kvcw==
+warcio@^2.3.0:
+  version "2.3.0"
+  resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.3.0.tgz#a655df9b5986a53e5d05aa68cda51bfefdfa8347"
+  integrity sha512-PCHcZ/fDE5+QECOFe/n/vzyDmAITJ1mvLx1jVONJ0uaV9OwcTbIWoh7Z0+OQwQdq8Wr1Nnb2hwhtHJ7J+9rHIQ==
   dependencies:
+    "@types/pako" "^1.0.7"
+    "@types/stream-buffers" "^3.0.7"
     base32-encode "^2.0.0"
     hash-wasm "^4.9.0"
     pako "^1.0.11"
@@ -5305,7 +5337,7 @@ which@^2.0.1:
   dependencies:
     isexe "^2.0.0"
 
-wrap-ansi@^7.0.0:
+wrap-ansi@7.0.0, wrap-ansi@^7.0.0:
   version "7.0.0"
   resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
  integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==