Merge branch 'unify-warc-writer' into use-js-wacz

This commit is contained in:
Ilya Kreymer 2024-03-22 21:49:15 -07:00
commit 50a771cc68
8 changed files with 197 additions and 191 deletions

View file

@ -65,6 +65,7 @@ import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core";
import { Recorder } from "./util/recorder.js"; import { Recorder } from "./util/recorder.js";
import { SitemapReader } from "./util/sitemapper.js"; import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js"; import { ScopedSeed } from "./util/seeds.js";
import { WARCWriter } from "./util/warcwriter.js";
const HTTPS_AGENT = new HTTPSAgent({ const HTTPS_AGENT = new HTTPSAgent({
rejectUnauthorized: false, rejectUnauthorized: false,
@ -155,6 +156,11 @@ export class Crawler {
otherPagesFile: string; otherPagesFile: string;
archivesDir: string; archivesDir: string;
tempdir: string;
tempCdxDir: string;
screenshotWriter: WARCWriter | null;
textWriter: WARCWriter | null;
blockRules: BlockRules | null; blockRules: BlockRules | null;
adBlockRules: AdBlockRules | null; adBlockRules: AdBlockRules | null;
@ -183,8 +189,6 @@ export class Crawler {
maxHeapUsed = 0; maxHeapUsed = 0;
maxHeapTotal = 0; maxHeapTotal = 0;
warcPrefix: string;
driver!: (opts: { driver!: (opts: {
page: Page; page: Page;
data: PageState; data: PageState;
@ -278,6 +282,11 @@ export class Crawler {
// archives dir // archives dir
this.archivesDir = path.join(this.collDir, "archive"); this.archivesDir = path.join(this.collDir, "archive");
this.tempdir = path.join(os.tmpdir(), "tmp-dl");
this.tempCdxDir = path.join(this.collDir, "tmp-cdx");
this.screenshotWriter = null;
this.textWriter = null;
this.blockRules = null; this.blockRules = null;
this.adBlockRules = null; this.adBlockRules = null;
@ -295,12 +304,6 @@ export class Crawler {
this.customBehaviors = ""; this.customBehaviors = "";
this.browser = new Browser(); this.browser = new Browser();
this.warcPrefix = process.env.WARC_PREFIX || this.params.warcPrefix || "";
if (this.warcPrefix) {
this.warcPrefix += "-" + this.crawlId + "-";
}
} }
protected parseArgs() { protected parseArgs() {
@ -454,14 +457,10 @@ export class Crawler {
subprocesses.push(this.launchRedis()); subprocesses.push(this.launchRedis());
//const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd});
//if (initRes.status) {
// logger.info("wb-manager init failed, collection likely already exists");
//}
await fsp.mkdir(this.logDir, { recursive: true }); await fsp.mkdir(this.logDir, { recursive: true });
await fsp.mkdir(this.archivesDir, { recursive: true }); await fsp.mkdir(this.archivesDir, { recursive: true });
await fsp.mkdir(this.tempdir, { recursive: true });
await fsp.mkdir(this.tempCdxDir, { recursive: true });
this.logFH = fs.createWriteStream(this.logFilename); this.logFH = fs.createWriteStream(this.logFilename);
logger.setExternalLogStream(this.logFH); logger.setExternalLogStream(this.logFH);
@ -521,6 +520,13 @@ export class Crawler {
{ detached: RUN_DETACHED }, { detached: RUN_DETACHED },
); );
} }
if (this.params.screenshot) {
this.screenshotWriter = this.createExtraResourceWarcWriter("screenshots");
}
if (this.params.text) {
this.textWriter = this.createExtraResourceWarcWriter("text");
}
} }
extraChromeArgs() { extraChromeArgs() {
@ -819,16 +825,15 @@ self.__bx_behaviors.selectMainBehavior();
const logDetails = { page: url, workerid }; const logDetails = { page: url, workerid };
if (this.params.screenshot) { if (this.params.screenshot && this.screenshotWriter) {
if (!data.isHTMLPage) { if (!data.isHTMLPage) {
logger.debug("Skipping screenshots for non-HTML page", logDetails); logger.debug("Skipping screenshots for non-HTML page", logDetails);
} }
const screenshots = new Screenshots({ const screenshots = new Screenshots({
warcPrefix: this.warcPrefix,
browser: this.browser, browser: this.browser,
page, page,
url, url,
directory: this.archivesDir, writer: this.screenshotWriter,
}); });
if (this.params.screenshot.includes("view")) { if (this.params.screenshot.includes("view")) {
await screenshots.take("view", saveOutput ? data : null); await screenshots.take("view", saveOutput ? data : null);
@ -843,11 +848,10 @@ self.__bx_behaviors.selectMainBehavior();
let textextract = null; let textextract = null;
if (data.isHTMLPage) { if (data.isHTMLPage && this.textWriter) {
textextract = new TextExtractViaSnapshot(cdp, { textextract = new TextExtractViaSnapshot(cdp, {
warcPrefix: this.warcPrefix, writer: this.textWriter,
url, url,
directory: this.archivesDir,
skipDocs: this.skipTextDocs, skipDocs: this.skipTextDocs,
}); });
const { text } = await textextract.extractAndStoreText( const { text } = await textextract.extractAndStoreText(
@ -1303,6 +1307,8 @@ self.__bx_behaviors.selectMainBehavior();
await this.closePages(); await this.closePages();
await this.closeFiles();
await this.writeStats(); await this.writeStats();
// if crawl has been stopped, mark as final exit for post-crawl tasks // if crawl has been stopped, mark as final exit for post-crawl tasks
@ -1339,6 +1345,15 @@ self.__bx_behaviors.selectMainBehavior();
} }
} }
/**
 * Flush the extra-resource WARC writers owned by the crawler.
 *
 * The text writer is flushed first, then the screenshot writer; a writer
 * that was never created (still `null`) is skipped.
 */
async closeFiles() {
  for (const writer of [this.textWriter, this.screenshotWriter]) {
    if (writer) {
      await writer.flush();
    }
  }
}
protected async _addInitialSeeds() { protected async _addInitialSeeds() {
for (let i = 0; i < this.params.scopedSeeds.length; i++) { for (let i = 0; i < this.params.scopedSeeds.length; i++) {
const seed = this.params.scopedSeeds[i]; const seed = this.params.scopedSeeds[i];
@ -2385,15 +2400,54 @@ self.__bx_behaviors.selectMainBehavior();
} }
} }
/**
 * Build the filename prefix used for WARC files written by this crawl.
 *
 * The base is the first non-empty value among the WARC_PREFIX environment
 * variable, the `warcPrefix` param and `defaultValue`. A non-empty base is
 * returned as `<base>-<crawlId>-`; an empty base is returned unchanged.
 *
 * @param defaultValue fallback base when neither env var nor param is set
 */
getWarcPrefix(defaultValue = "") {
  const base =
    process.env.WARC_PREFIX || this.params.warcPrefix || defaultValue;

  // no base at all: leave the prefix empty rather than emitting "-<crawlId>-"
  if (!base) {
    return base;
  }

  return `${base}-${this.crawlId}-`;
}
/**
 * Create a WARC writer for an auxiliary resource stream.
 *
 * The file is named `<warc prefix><resourceName>` plus the WARC extension,
 * and the resource name is attached to the writer's log details.
 *
 * @param resourceName resource stream name, e.g. "screenshots" or "text"
 * @param gzip whether the WARC is gzip-compressed (default true)
 */
createExtraResourceWarcWriter(resourceName: string, gzip = true) {
  const prefix = this.getWarcPrefix();
  const logDetails = { resourceName };
  return this.createWarcWriter(`${prefix}${resourceName}`, gzip, logDetails);
}
/**
 * Construct a WARCWriter that uses this crawler's archive dir, temp CDX
 * dir and rollover size.
 *
 * @param filenameBase filename without extension; ".warc" (plus ".gz" when
 *   gzip is enabled) is appended
 * @param gzip whether records are gzip-compressed
 * @param logDetails extra key/value pairs passed through to the writer
 */
createWarcWriter(
  filenameBase: string,
  gzip: boolean,
  logDetails: Record<string, string>,
) {
  const gzSuffix = gzip ? ".gz" : "";

  return new WARCWriter({
    archivesDir: this.archivesDir,
    tempCdxDir: this.tempCdxDir,
    filenameTemplate: filenameBase + ".warc" + gzSuffix,
    rolloverSize: this.params.rolloverSize,
    gzip,
    logDetails,
  });
}
createRecorder(id: number): Recorder | null { createRecorder(id: number): Recorder | null {
if (!this.recording) { if (!this.recording) {
return null; return null;
} }
const filenameBase = `${this.getWarcPrefix("rec")}$ts-${id}`;
const writer = this.createWarcWriter(filenameBase, true, { id: id + "" });
const res = new Recorder({ const res = new Recorder({
workerid: id, workerid: id,
collDir: this.collDir,
crawler: this, crawler: this,
writer,
tempdir: this.tempdir,
}); });
this.browser.recorders.push(res); this.browser.recorders.push(res);

View file

@ -16,7 +16,6 @@ import { ZipRangeReader } from "@webrecorder/wabac/src/wacz/ziprangereader.js";
import { createLoader } from "@webrecorder/wabac/src/blockloaders.js"; import { createLoader } from "@webrecorder/wabac/src/blockloaders.js";
import { AsyncIterReader } from "warcio"; import { AsyncIterReader } from "warcio";
import { WARCResourceWriter } from "./util/warcresourcewriter.js";
import { parseArgs } from "./util/argParser.js"; import { parseArgs } from "./util/argParser.js";
import { PNG } from "pngjs"; import { PNG } from "pngjs";
@ -25,6 +24,7 @@ import pixelmatch from "pixelmatch";
import levenshtein from "js-levenshtein"; import levenshtein from "js-levenshtein";
import { MAX_URL_LENGTH } from "./util/reqresp.js"; import { MAX_URL_LENGTH } from "./util/reqresp.js";
import { openAsBlob } from "fs"; import { openAsBlob } from "fs";
import { WARCWriter } from "./util/warcwriter.js";
// RWP Replay Prefix // RWP Replay Prefix
const REPLAY_PREFIX = "http://localhost:9990/replay/w/replay/"; const REPLAY_PREFIX = "http://localhost:9990/replay/w/replay/";
@ -67,6 +67,7 @@ export class ReplayCrawler extends Crawler {
qaSource: string; qaSource: string;
pageInfos: Map<Page, ReplayPageInfoRecord>; pageInfos: Map<Page, ReplayPageInfoRecord>;
infoWriter: WARCWriter | null;
reloadTimeouts: WeakMap<Page, NodeJS.Timeout>; reloadTimeouts: WeakMap<Page, NodeJS.Timeout>;
@ -98,6 +99,14 @@ export class ReplayCrawler extends Crawler {
this.params.serviceWorker = "enabled"; this.params.serviceWorker = "enabled";
this.reloadTimeouts = new WeakMap<Page, NodeJS.Timeout>(); this.reloadTimeouts = new WeakMap<Page, NodeJS.Timeout>();
this.infoWriter = null;
}
/**
 * Run the base crawler bootstrap, then create the WARC writer used for
 * per-page "pageinfo" records (file name `<warc prefix>info.warc.gz`).
 */
async bootstrap(): Promise<void> {
// the base bootstrap is awaited before the writer is created
await super.bootstrap();
// NOTE(review): the Crawler.closeFiles() visible in this change flushes only
// the text and screenshot writers -- confirm infoWriter is flushed somewhere
// before exit, otherwise its file handle may never be closed.
this.infoWriter = this.createExtraResourceWarcWriter("info");
}
protected parseArgs() { protected parseArgs() {
@ -666,18 +675,13 @@ export class ReplayCrawler extends Crawler {
(state as ComparisonPageState).comparison = comparison; (state as ComparisonPageState).comparison = comparison;
} }
const writer = new WARCResourceWriter({ await this.infoWriter?.writeNewResourceRecord({
buffer: new TextEncoder().encode(JSON.stringify(pageInfo, null, 2)),
resourceType: "pageinfo",
contentType: "application/json",
url: pageInfo.url, url: pageInfo.url,
directory: this.archivesDir,
warcPrefix: this.warcPrefix,
date: new Date(),
warcName: "info.warc.gz",
}); });
await writer.writeBufferToWARC(
new TextEncoder().encode(JSON.stringify(pageInfo, null, 2)),
"pageinfo",
"application/json",
);
this.pageInfos.delete(page); this.pageInfos.delete(page);
} }
} }

View file

@ -1,6 +1,4 @@
import fs from "fs";
import path from "path"; import path from "path";
import os from "os";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
@ -24,7 +22,6 @@ import { WARCWriter } from "./warcwriter.js";
import { RedisCrawlState, WorkerId } from "./state.js"; import { RedisCrawlState, WorkerId } from "./state.js";
import { CDPSession, Protocol } from "puppeteer-core"; import { CDPSession, Protocol } from "puppeteer-core";
import { Crawler } from "../crawler.js"; import { Crawler } from "../crawler.js";
import { WARCResourceWriter } from "./warcresourcewriter.js";
const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000; const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
const MAX_BROWSER_TEXT_FETCH_SIZE = 25_000_000; const MAX_BROWSER_TEXT_FETCH_SIZE = 25_000_000;
@ -70,7 +67,6 @@ export type PageInfoRecord = {
// ================================================================= // =================================================================
export class Recorder { export class Recorder {
workerid: WorkerId; workerid: WorkerId;
collDir: string;
crawler: Crawler; crawler: Crawler;
@ -94,9 +90,7 @@ export class Recorder {
allowFull206 = false; allowFull206 = false;
archivesDir: string;
tempdir: string; tempdir: string;
tempCdxDir: string;
gzip = true; gzip = true;
@ -107,46 +101,26 @@ export class Recorder {
constructor({ constructor({
workerid, workerid,
collDir, writer,
crawler, crawler,
tempdir,
}: { }: {
workerid: WorkerId; workerid: WorkerId;
collDir: string; writer: WARCWriter;
crawler: Crawler; crawler: Crawler;
tempdir: string;
}) { }) {
this.workerid = workerid; this.workerid = workerid;
this.crawler = crawler; this.crawler = crawler;
this.crawlState = crawler.crawlState; this.crawlState = crawler.crawlState;
this.writer = writer;
this.tempdir = tempdir;
this.warcQ = new PQueue({ concurrency: 1 }); this.warcQ = new PQueue({ concurrency: 1 });
this.fetcherQ = new PQueue({ concurrency: 1 }); this.fetcherQ = new PQueue({ concurrency: 1 });
this.collDir = collDir;
this.archivesDir = path.join(this.collDir, "archive");
this.tempdir = path.join(os.tmpdir(), "tmp-dl");
this.tempCdxDir = path.join(this.collDir, "tmp-cdx");
fs.mkdirSync(this.tempdir, { recursive: true });
fs.mkdirSync(this.archivesDir, { recursive: true });
fs.mkdirSync(this.tempCdxDir, { recursive: true });
const prefix =
process.env.WARC_PREFIX || crawler.params.warcPrefix || "rec";
const crawlId = process.env.CRAWL_ID || os.hostname();
const filenameTemplate = `${prefix}-${crawlId}-$ts-${this.workerid}.warc${
this.gzip ? ".gz" : ""
}`;
this.writer = new WARCWriter({
archivesDir: this.archivesDir,
tempCdxDir: this.tempCdxDir,
filenameTemplate,
rolloverSize: crawler.params.rolloverSize,
gzip: this.gzip,
logDetails: this.logDetails,
});
} }
async onCreatePage({ cdp }: { cdp: CDPSession }) { async onCreatePage({ cdp }: { cdp: CDPSession }) {
@ -733,18 +707,19 @@ export class Recorder {
} }
} }
async writePageInfoRecord() { writePageInfoRecord() {
const text = JSON.stringify(this.pageInfo, null, 2); const text = JSON.stringify(this.pageInfo, null, 2);
const resourceRecord = await WARCResourceWriter.createResourceRecord( const url = this.pageUrl;
new TextEncoder().encode(text),
"pageinfo",
"application/json",
this.pageUrl,
new Date(),
);
this.warcQ.add(() => this.writer.writeSingleRecord(resourceRecord)); this.warcQ.add(() =>
this.writer.writeNewResourceRecord({
buffer: new TextEncoder().encode(text),
resourceType: "pageinfo",
contentType: "application/json",
url,
}),
);
return this.pageInfo.ts; return this.pageInfo.ts;
} }
@ -796,6 +771,8 @@ export class Recorder {
} }
async onDone(timeout: number) { async onDone(timeout: number) {
console.log("ON DONE!!");
await this.crawlState.setStatus("pending-wait"); await this.crawlState.setStatus("pending-wait");
const finishFetch = async () => { const finishFetch = async () => {

View file

@ -1,10 +1,10 @@
import sharp from "sharp"; import sharp from "sharp";
import { WARCResourceWriter } from "./warcresourcewriter.js";
import { logger, formatErr } from "./logger.js"; import { logger, formatErr } from "./logger.js";
import { Browser } from "./browser.js"; import { Browser } from "./browser.js";
import { Page } from "puppeteer-core"; import { Page } from "puppeteer-core";
import { PageState } from "./state.js"; import { PageState } from "./state.js";
import { WARCWriter } from "./warcwriter.js";
// ============================================================================ // ============================================================================
@ -42,18 +42,20 @@ export type ScreenshotOpts = {
browser: Browser; browser: Browser;
page: Page; page: Page;
url: string; url: string;
directory: string; writer: WARCWriter;
warcPrefix: string;
}; };
export class Screenshots extends WARCResourceWriter { export class Screenshots {
browser: Browser; browser: Browser;
page: Page; page: Page;
url: string;
writer: WARCWriter;
constructor(opts: ScreenshotOpts) { constructor({ browser, page, writer, url }: ScreenshotOpts) {
super({ ...opts, warcName: "screenshots.warc.gz" }); this.browser = browser;
this.browser = opts.browser; this.page = page;
this.page = opts.page; this.url = url;
this.writer = writer;
} }
async take( async take(
@ -72,13 +74,14 @@ export class Screenshots extends WARCResourceWriter {
if (state && screenshotType === "view") { if (state && screenshotType === "view") {
state.screenshotView = screenshotBuffer; state.screenshotView = screenshotBuffer;
} }
await this.writeBufferToWARC( await this.writer.writeNewResourceRecord({
screenshotBuffer, buffer: screenshotBuffer,
screenshotType, resourceType: screenshotType,
"image/" + options.type, contentType: "image/" + options.type,
); url: this.url,
});
logger.info( logger.info(
`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`, `Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.writer.filename}`,
); );
} catch (e) { } catch (e) {
logger.error( logger.error(
@ -103,13 +106,14 @@ export class Screenshots extends WARCResourceWriter {
// 16:9 thumbnail // 16:9 thumbnail
.resize(640, 360) .resize(640, 360)
.toBuffer(); .toBuffer();
await this.writeBufferToWARC( await this.writer.writeNewResourceRecord({
thumbnailBuffer, buffer: thumbnailBuffer,
screenshotType, resourceType: screenshotType,
"image/" + options.type, contentType: "image/" + options.type,
); url: this.url,
});
logger.info( logger.info(
`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`, `Screenshot (type: thumbnail) for ${this.url} written to ${this.writer.filename}`,
); );
} catch (e) { } catch (e) {
logger.error( logger.error(

View file

@ -1,26 +1,28 @@
import { WARCResourceWriter } from "./warcresourcewriter.js";
import { logger } from "./logger.js"; import { logger } from "./logger.js";
import { CDPSession, Protocol } from "puppeteer-core"; import { CDPSession, Protocol } from "puppeteer-core";
import { WARCWriter } from "./warcwriter.js";
// ============================================================================ // ============================================================================
type TextExtractOpts = { type TextExtractOpts = {
url: string; url: string;
directory: string; writer: WARCWriter;
warcPrefix: string;
skipDocs: number; skipDocs: number;
}; };
// ============================================================================ // ============================================================================
export abstract class BaseTextExtract extends WARCResourceWriter { export abstract class BaseTextExtract {
cdp: CDPSession; cdp: CDPSession;
lastText: string | null = null; lastText: string | null = null;
text: string | null = null; text: string | null = null;
skipDocs: number = 0; skipDocs: number = 0;
writer: WARCWriter;
url: string;
constructor(cdp: CDPSession, opts: TextExtractOpts) { constructor(cdp: CDPSession, { writer, skipDocs, url }: TextExtractOpts) {
super({ ...opts, warcName: "text.warc.gz" }); this.writer = writer;
this.cdp = cdp; this.cdp = cdp;
this.skipDocs = opts.skipDocs || 0; this.url = url;
this.skipDocs = skipDocs || 0;
} }
async extractAndStoreText( async extractAndStoreText(
@ -41,13 +43,14 @@ export abstract class BaseTextExtract extends WARCResourceWriter {
return { changed: false, text }; return { changed: false, text };
} }
if (saveToWarc) { if (saveToWarc) {
await this.writeBufferToWARC( await this.writer.writeNewResourceRecord({
new TextEncoder().encode(text), buffer: new TextEncoder().encode(text),
resourceType, resourceType,
"text/plain", contentType: "text/plain",
); url: this.url,
});
logger.debug( logger.debug(
`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`, `Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.writer.filename}`,
); );
} }

View file

@ -1,78 +0,0 @@
import fs from "fs";
import path from "path";
import * as warcio from "warcio";
// ===========================================================================
/** Constructor options for WARCResourceWriter. */
export type WARCResourceWriterOpts = {
// URL the resource belongs to; embedded in the record URI as urn:<resourceType>:<url>
url: string;
// directory the WARC file path is joined onto
directory: string;
// record date; when omitted the writer uses the time of construction
date?: Date;
// WARC file name, appended to warcPrefix
warcName: string;
// string prepended to warcName to form the file name
warcPrefix: string;
};
// ===========================================================================
/**
 * Appends WARC `resource` records for a single URL to one gzip-compressed
 * WARC file located at `<directory>/<warcPrefix><warcName>`.
 */
export class WARCResourceWriter {
  url: string;
  directory: string;
  warcName: string;
  date: Date;

  constructor(opts: WARCResourceWriterOpts) {
    this.url = opts.url;
    this.directory = opts.directory;
    // full path of the target WARC file
    this.warcName = path.join(opts.directory, opts.warcPrefix + opts.warcName);
    // fall back to "now" when no explicit record date is given
    this.date = opts.date ?? new Date();
  }

  /**
   * Serialize `contents` as a gzip-compressed resource record and append it
   * synchronously to the WARC file.
   *
   * @param contents raw record payload
   * @param resourceType label used in the record URI (urn:<resourceType>:<url>)
   * @param contentType value of the record's Content-Type header
   */
  async writeBufferToWARC(
    contents: Uint8Array,
    resourceType: string,
    contentType: string,
  ) {
    const record = await WARCResourceWriter.createResourceRecord(
      contents,
      resourceType,
      contentType,
      this.url,
      this.date,
    );
    const serialized = await warcio.WARCSerializer.serialize(record, {
      gzip: true,
    });
    fs.appendFileSync(this.warcName, serialized);
  }

  /**
   * Create a WARC/1.1 `resource` record whose target URI is
   * `urn:<resourceType>:<url>` and whose payload is `buffer`.
   *
   * @param buffer raw record payload
   * @param resourceType label used in the record URI
   * @param contentType value of the record's Content-Type header
   * @param url URL the resource belongs to
   * @param date record date, written as an ISO-8601 string
   */
  static async createResourceRecord(
    buffer: Uint8Array,
    resourceType: string,
    contentType: string,
    url: string,
    date: Date,
  ) {
    // the payload is supplied as a one-chunk async stream
    async function* payload() {
      yield buffer;
    }

    return warcio.WARCRecord.create(
      {
        url: `urn:${resourceType}:${url}`,
        date: date.toISOString(),
        type: "resource",
        warcVersion: "WARC/1.1",
        warcHeaders: { "Content-Type": contentType },
      },
      payload(),
    );
  }
}

View file

@ -2,14 +2,22 @@ import fs from "fs";
import { Writable } from "stream"; import { Writable } from "stream";
import path from "path"; import path from "path";
import { CDXIndexer } from "warcio"; import { CDXIndexer, WARCRecord } from "warcio";
import { WARCSerializer } from "warcio/node"; import { WARCSerializer } from "warcio/node";
import { logger, formatErr } from "./logger.js"; import { logger, formatErr } from "./logger.js";
import type { IndexerOffsetLength, WARCRecord } from "warcio"; import type { IndexerOffsetLength } from "warcio";
import { timestampNow } from "./timing.js"; import { timestampNow } from "./timing.js";
const DEFAULT_ROLLOVER_SIZE = 1_000_000_000; const DEFAULT_ROLLOVER_SIZE = 1_000_000_000;
/** Input for WARCWriter.writeNewResourceRecord(). */
export type ResourceRecordData = {
// raw payload written as the record content
buffer: Uint8Array;
// label used in the record URI: urn:<resourceType>:<url>
resourceType: string;
// value of the record's Content-Type header
contentType: string;
// URL the resource belongs to
url: string;
// record date; the current time is used when omitted
date?: Date;
};
// ================================================================= // =================================================================
export class WARCWriter implements IndexerOffsetLength { export class WARCWriter implements IndexerOffsetLength {
archivesDir: string; archivesDir: string;
@ -47,6 +55,8 @@ export class WARCWriter implements IndexerOffsetLength {
}) { }) {
this.archivesDir = archivesDir; this.archivesDir = archivesDir;
this.tempCdxDir = tempCdxDir; this.tempCdxDir = tempCdxDir;
// for now, disabling CDX
this.tempCdxDir = undefined;
this.logDetails = logDetails; this.logDetails = logDetails;
this.gzip = gzip; this.gzip = gzip;
this.rolloverSize = rolloverSize; this.rolloverSize = rolloverSize;
@ -137,6 +147,39 @@ export class WARCWriter implements IndexerOffsetLength {
this._writeCDX(record); this._writeCDX(record);
} }
/**
 * Wrap `buffer` in a new WARC/1.1 `resource` record and write it through
 * writeSingleRecord().
 *
 * The record's target URI is `urn:<resourceType>:<url>`, `contentType`
 * becomes its Content-Type header, and the date defaults to the current
 * time when none is supplied.
 *
 * @returns whatever writeSingleRecord() resolves to
 */
async writeNewResourceRecord({
  buffer,
  resourceType,
  contentType,
  url,
  date,
}: ResourceRecordData) {
  const recordDate = date ?? new Date();

  // the payload is supplied as a one-chunk async stream
  async function* payload() {
    yield buffer;
  }

  const record = WARCRecord.create(
    {
      url: `urn:${resourceType}:${url}`,
      date: recordDate.toISOString(),
      type: "resource",
      warcVersion: "WARC/1.1",
      warcHeaders: { "Content-Type": contentType },
    },
    payload(),
  );

  return await this.writeSingleRecord(record);
}
private async _writeRecord(record: WARCRecord, serializer: WARCSerializer) { private async _writeRecord(record: WARCRecord, serializer: WARCSerializer) {
if (this.done) { if (this.done) {
logger.warn( logger.warn(
@ -188,8 +231,6 @@ export class WARCWriter implements IndexerOffsetLength {
} }
async flush() { async flush() {
this.done = true;
if (this.fh) { if (this.fh) {
await streamFinish(this.fh); await streamFinish(this.fh);
this.fh = null; this.fh = null;
@ -201,6 +242,8 @@ export class WARCWriter implements IndexerOffsetLength {
await streamFinish(this.cdxFH); await streamFinish(this.cdxFH);
this.cdxFH = null; this.cdxFH = null;
} }
this.done = true;
} }
} }

View file

@ -290,7 +290,7 @@ export class PageWorker {
} finally { } finally {
try { try {
if (this.recorder) { if (this.recorder) {
opts.data.ts = await this.recorder.writePageInfoRecord(); opts.data.ts = this.recorder.writePageInfoRecord();
} }
} catch (e) { } catch (e) {
logger.error( logger.error(
@ -403,7 +403,6 @@ export async function runWorkers(
) { ) {
logger.info(`Creating ${numWorkers} workers`, {}, "worker"); logger.info(`Creating ${numWorkers} workers`, {}, "worker");
const workers = [];
let offset = 0; let offset = 0;
// automatically set worker start by ordinal in k8s // automatically set worker start by ordinal in k8s