mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
For QA, pageinfo records are written to an info.warc.gz.
During a crawl, they're added to the crawl WARC; this change makes them be written to an info.warc.gz as well.
This commit is contained in:
parent
cc39febb4a
commit
8eace5eef0
2 changed files with 12 additions and 1 deletions
|
@ -2513,10 +2513,13 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
id: id.toString(),
|
||||
});
|
||||
|
||||
const infoWriter = this.createExtraResourceWarcWriter("info");
|
||||
|
||||
const res = new Recorder({
|
||||
workerid: id,
|
||||
crawler: this,
|
||||
writer,
|
||||
infoWriter,
|
||||
tempdir: this.tempdir,
|
||||
});
|
||||
|
||||
|
|
|
@ -113,6 +113,7 @@ export class Recorder {
|
|||
gzip = true;
|
||||
|
||||
writer: WARCWriter;
|
||||
infoWriter: WARCWriter;
|
||||
|
||||
pageUrl!: string;
|
||||
pageid!: string;
|
||||
|
@ -120,11 +121,13 @@ export class Recorder {
|
|||
constructor({
|
||||
workerid,
|
||||
writer,
|
||||
infoWriter,
|
||||
crawler,
|
||||
tempdir,
|
||||
}: {
|
||||
workerid: WorkerId;
|
||||
writer: WARCWriter;
|
||||
infoWriter?: WARCWriter;
|
||||
crawler: Crawler;
|
||||
tempdir: string;
|
||||
}) {
|
||||
|
@ -133,6 +136,7 @@ export class Recorder {
|
|||
this.crawlState = crawler.crawlState;
|
||||
|
||||
this.writer = writer;
|
||||
this.infoWriter = infoWriter || writer;
|
||||
|
||||
this.tempdir = tempdir;
|
||||
|
||||
|
@ -725,7 +729,7 @@ export class Recorder {
|
|||
|
||||
const url = this.pageUrl;
|
||||
|
||||
this.writer.writeNewResourceRecord(
|
||||
this.infoWriter.writeNewResourceRecord(
|
||||
{
|
||||
buffer: new TextEncoder().encode(text),
|
||||
resourceType: "pageinfo",
|
||||
|
@ -806,6 +810,10 @@ export class Recorder {
|
|||
logger.debug("Finishing WARC writing", this.logDetails, "recorder");
|
||||
|
||||
await this.writer.flush();
|
||||
|
||||
if (this.infoWriter !== this.writer) {
|
||||
await this.infoWriter.flush();
|
||||
}
|
||||
}
|
||||
|
||||
shouldSkip(
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue