For QA, pageinfo records are written to an info.warc.gz

During a crawl, they are added to the crawl WARC; this change makes them also be written to an info.warc.gz.
This commit is contained in:
Ilya Kreymer 2024-04-12 13:59:21 -07:00
parent cc39febb4a
commit 8eace5eef0
2 changed files with 12 additions and 1 deletions

View file

@ -2513,10 +2513,13 @@ self.__bx_behaviors.selectMainBehavior();
id: id.toString(),
});
const infoWriter = this.createExtraResourceWarcWriter("info");
const res = new Recorder({
workerid: id,
crawler: this,
writer,
infoWriter,
tempdir: this.tempdir,
});

View file

@ -113,6 +113,7 @@ export class Recorder {
gzip = true;
writer: WARCWriter;
infoWriter: WARCWriter;
pageUrl!: string;
pageid!: string;
@ -120,11 +121,13 @@ export class Recorder {
constructor({
workerid,
writer,
infoWriter,
crawler,
tempdir,
}: {
workerid: WorkerId;
writer: WARCWriter;
infoWriter?: WARCWriter;
crawler: Crawler;
tempdir: string;
}) {
@ -133,6 +136,7 @@ export class Recorder {
this.crawlState = crawler.crawlState;
this.writer = writer;
this.infoWriter = infoWriter || writer;
this.tempdir = tempdir;
@ -725,7 +729,7 @@ export class Recorder {
const url = this.pageUrl;
this.writer.writeNewResourceRecord(
this.infoWriter.writeNewResourceRecord(
{
buffer: new TextEncoder().encode(text),
resourceType: "pageinfo",
@ -806,6 +810,10 @@ export class Recorder {
logger.debug("Finishing WARC writing", this.logDetails, "recorder");
await this.writer.flush();
if (this.infoWriter !== this.writer) {
await this.infoWriter.flush();
}
}
shouldSkip(