From 8eace5eef006c68c946909064db9fecbb982bc52 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 12 Apr 2024 13:59:21 -0700 Subject: [PATCH] For QA, pageinfo records are written to an info.warc.gz; during crawl, they're added to the crawl WARC. This makes them be written to an info.warc.gz during crawl as well --- src/crawler.ts | 3 +++ src/util/recorder.ts | 10 +++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/crawler.ts b/src/crawler.ts index 8229542c..df866792 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -2513,10 +2513,13 @@ self.__bx_behaviors.selectMainBehavior(); id: id.toString(), }); + const infoWriter = this.createExtraResourceWarcWriter("info"); + const res = new Recorder({ workerid: id, crawler: this, writer, + infoWriter, tempdir: this.tempdir, }); diff --git a/src/util/recorder.ts b/src/util/recorder.ts index c19b5e52..17b26f13 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -113,6 +113,7 @@ export class Recorder { gzip = true; writer: WARCWriter; + infoWriter: WARCWriter; pageUrl!: string; pageid!: string; @@ -120,11 +121,13 @@ export class Recorder { constructor({ workerid, writer, + infoWriter, crawler, tempdir, }: { workerid: WorkerId; writer: WARCWriter; + infoWriter?: WARCWriter; crawler: Crawler; tempdir: string; }) { @@ -133,6 +136,7 @@ export class Recorder { this.crawlState = crawler.crawlState; this.writer = writer; + this.infoWriter = infoWriter || writer; this.tempdir = tempdir; @@ -725,7 +729,7 @@ export class Recorder { const url = this.pageUrl; - this.writer.writeNewResourceRecord( + this.infoWriter.writeNewResourceRecord( { buffer: new TextEncoder().encode(text), resourceType: "pageinfo", @@ -806,6 +810,10 @@ export class Recorder { logger.debug("Finishing WARC writing", this.logDetails, "recorder"); await this.writer.flush(); + + if (this.infoWriter !== this.writer) { + await this.infoWriter.flush(); + } } shouldSkip(