mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Always add warcinfo records to all WARCs (#556)
Fixes #553. Includes `warcinfo` records at the beginning of each new WARC, as well as in the combined WARC. Also makes the warcinfo record WARC/1.1, to match the rest of the WARC records.
This commit is contained in:
parent
894681e5fc
commit
089d901b9b
3 changed files with 70 additions and 25 deletions
|
@ -16,8 +16,6 @@ import { parseArgs } from "./util/argParser.js";
|
||||||
|
|
||||||
import yaml from "js-yaml";
|
import yaml from "js-yaml";
|
||||||
|
|
||||||
import * as warcio from "warcio";
|
|
||||||
|
|
||||||
import { HealthChecker } from "./util/healthcheck.js";
|
import { HealthChecker } from "./util/healthcheck.js";
|
||||||
import { TextExtractViaSnapshot } from "./util/textextract.js";
|
import { TextExtractViaSnapshot } from "./util/textextract.js";
|
||||||
import {
|
import {
|
||||||
|
@ -60,7 +58,7 @@ import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core";
|
||||||
import { Recorder } from "./util/recorder.js";
|
import { Recorder } from "./util/recorder.js";
|
||||||
import { SitemapReader } from "./util/sitemapper.js";
|
import { SitemapReader } from "./util/sitemapper.js";
|
||||||
import { ScopedSeed } from "./util/seeds.js";
|
import { ScopedSeed } from "./util/seeds.js";
|
||||||
import { WARCWriter } from "./util/warcwriter.js";
|
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
|
||||||
|
|
||||||
const HTTPS_AGENT = new HTTPSAgent({
|
const HTTPS_AGENT = new HTTPSAgent({
|
||||||
rejectUnauthorized: false,
|
rejectUnauthorized: false,
|
||||||
|
@ -458,6 +456,7 @@ export class Crawler {
|
||||||
logger.setExternalLogStream(this.logFH);
|
logger.setExternalLogStream(this.logFH);
|
||||||
|
|
||||||
this.infoString = await getInfoString();
|
this.infoString = await getInfoString();
|
||||||
|
setWARCInfo(this.infoString, this.params.warcInfo);
|
||||||
logger.info(this.infoString);
|
logger.info(this.infoString);
|
||||||
|
|
||||||
logger.info("Seeds", this.params.scopedSeeds);
|
logger.info("Seeds", this.params.scopedSeeds);
|
||||||
|
@ -1104,26 +1103,6 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
return res ? frame : null;
|
return res ? frame : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
async createWARCInfo(filename: string) {
|
|
||||||
const warcVersion = "WARC/1.1";
|
|
||||||
const type = "warcinfo";
|
|
||||||
|
|
||||||
const info = {
|
|
||||||
software: this.infoString,
|
|
||||||
format: "WARC File Format 1.1",
|
|
||||||
};
|
|
||||||
|
|
||||||
const warcInfo = { ...info, ...this.params.warcInfo };
|
|
||||||
const record = await warcio.WARCRecord.createWARCInfo(
|
|
||||||
{ filename, type, warcVersion },
|
|
||||||
warcInfo,
|
|
||||||
);
|
|
||||||
const buffer = await warcio.WARCSerializer.serialize(record, {
|
|
||||||
gzip: true,
|
|
||||||
});
|
|
||||||
return buffer;
|
|
||||||
}
|
|
||||||
|
|
||||||
async checkLimits() {
|
async checkLimits() {
|
||||||
let interrupt = false;
|
let interrupt = false;
|
||||||
|
|
||||||
|
@ -2412,7 +2391,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
|
|
||||||
generatedCombinedWarcs.push(combinedWarcName);
|
generatedCombinedWarcs.push(combinedWarcName);
|
||||||
|
|
||||||
const warcBuffer = await this.createWARCInfo(combinedWarcName);
|
const warcBuffer = await createWARCInfo(combinedWarcName);
|
||||||
fh.write(warcBuffer);
|
fh.write(warcBuffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,8 @@ import PQueue from "p-queue";
|
||||||
|
|
||||||
const DEFAULT_ROLLOVER_SIZE = 1_000_000_000;
|
const DEFAULT_ROLLOVER_SIZE = 1_000_000_000;
|
||||||
|
|
||||||
|
let warcInfo = {};
|
||||||
|
|
||||||
export type ResourceRecordData = {
|
export type ResourceRecordData = {
|
||||||
buffer: Uint8Array;
|
buffer: Uint8Array;
|
||||||
resourceType: string;
|
resourceType: string;
|
||||||
|
@ -117,6 +119,8 @@ export class WARCWriter implements IndexerOffsetLength {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fh.write(await createWARCInfo(this.filename));
|
||||||
|
|
||||||
return fh;
|
return fh;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -310,6 +314,33 @@ export class WARCWriter implements IndexerOffsetLength {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =================================================================
|
||||||
|
/**
 * Replace the module-level `warcInfo` fields that `createWARCInfo` passes
 * into each warcinfo record.
 *
 * @param software - value stored under the `software` key.
 * @param otherParams - optional extra key/value pairs; they are spread last,
 *   so a key named `software` or `format` here overrides the default.
 */
export function setWARCInfo(
  software: string,
  otherParams?: Record<string, string>,
) {
  const defaults = { software, format: "WARC File Format 1.1" };

  warcInfo = { ...defaults, ...otherParams };
}
|
||||||
|
|
||||||
|
// =================================================================
|
||||||
|
/**
 * Build a `warcinfo` record for the given WARC filename and return it
 * serialized with gzip enabled.
 *
 * The record is created as WARC/1.1 and carries the module-level `warcInfo`
 * fields as its content.
 *
 * @param filename - filename passed to the warcinfo record.
 * @returns the serialized record produced by `WARCSerializer.serialize`.
 */
export async function createWARCInfo(filename: string) {
  const infoRecord = await WARCRecord.createWARCInfo(
    { filename, type: "warcinfo", warcVersion: "WARC/1.1" },
    warcInfo,
  );

  return await WARCSerializer.serialize(infoRecord, { gzip: true });
}
|
||||||
|
|
||||||
// =================================================================
|
// =================================================================
|
||||||
export function streamFinish(fh: Writable) {
|
export function streamFinish(fh: Writable) {
|
||||||
const p = new Promise<void>((resolve) => {
|
const p = new Promise<void>((resolve) => {
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
import fs from "fs";
|
import fs from "fs";
|
||||||
import zlib from "zlib";
|
import zlib from "zlib";
|
||||||
|
import path from "path";
|
||||||
import child_process from "child_process";
|
import child_process from "child_process";
|
||||||
|
|
||||||
test("check that the warcinfo file works as expected on the command line", async () => {
|
test("run crawl", async() => {
|
||||||
|
let success = false;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
|
const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
|
||||||
const proc = child_process.execSync(
|
const proc = child_process.execSync(
|
||||||
|
@ -11,10 +14,42 @@ test("check that the warcinfo file works as expected on the command line", async
|
||||||
);
|
);
|
||||||
|
|
||||||
console.log(proc);
|
console.log(proc);
|
||||||
|
success = true;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.log(error);
|
console.log(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
expect(success).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Reads the first "rec-" WARC from the crawl's archive directory, gunzips it
// and checks that the decoded text contains the expected warcinfo fields.
test("check that the warcinfo for individual WARC is as expected", async () => {
  const archiveDir = "test-crawls/collections/warcinfo/archive/";

  const name = fs
    .readdirSync(archiveDir)
    .find((entry) => entry.startsWith("rec-"));

  // Fail with an explicit message instead of letting readFileSync("") raise
  // an unrelated ENOENT when no individual WARC is present.
  if (!name) {
    throw new Error(`no "rec-" WARC found in ${archiveDir}`);
  }

  const warcData = fs.readFileSync(path.join(archiveDir, name));

  const string = zlib.gunzipSync(warcData).toString("utf8");

  expect(string).toContain("operator: test");
  expect(string).toContain("host: hostname");
  expect(string).toMatch(
    /Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/,
  );
  expect(string).toContain("format: WARC File Format 1.1");
});
|
||||||
|
|
||||||
|
test("check that the warcinfo for combined WARC file is as expected", async () => {
|
||||||
const warcData = fs.readFileSync(
|
const warcData = fs.readFileSync(
|
||||||
"test-crawls/collections/warcinfo/warcinfo_0.warc.gz",
|
"test-crawls/collections/warcinfo/warcinfo_0.warc.gz",
|
||||||
);
|
);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue