warc writing:

- update to warcio 2.4.6, write WARC-Payload-Digest along with WARC-Block-Digest for revisits
- copy additional custom WARC headers to revisit from response
This commit is contained in:
Ilya Kreymer 2025-09-17 20:48:32 -07:00
parent 87c94876f6
commit bbe084daa0

View file

@ -2074,14 +2074,14 @@ function createResponse(
warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
}
if (!contentIter) {
contentIter = [reqresp.payload] as Iterable<Uint8Array>;
}
if (Object.keys(reqresp.extraOpts).length) {
warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts);
}
if (!contentIter) {
contentIter = [reqresp.payload] as Iterable<Uint8Array>;
}
return WARCRecord.create(
{
url,
@ -2096,6 +2096,14 @@ function createResponse(
);
}
// =================================================================
/**
 * Names of the WARC headers that are meant to be carried over from an
 * original response record onto the revisit record created for it.
 */
const REVISIT_COPY_HEADERS: string[] = ["WARC-Page-ID", "WARC-Protocol", "WARC-Resource-Type", "WARC-JSON-Metadata"];
// =================================================================
// revisit
async function createRevisitForResponse(
@ -2104,12 +2112,17 @@ async function createRevisitForResponse(
refersToUrl: string,
refersToDate: string,
) {
const origPayloadDigest = responseRecord.warcPayloadDigest;
const payloadDigestForRevisit = responseRecord.warcPayloadDigest || "";
const warcHeaders: Record<string, string> = {
"WARC-Page-ID": responseRecord.warcHeaders.headers.get("WARC-Page-ID")!,
"WARC-Payload-Digest": origPayloadDigest!,
};
// WARC headers for the revisit record; filled from the original response below.
const warcHeaders: Record<string, string> = {};
const origWarcHeaders = responseRecord.warcHeaders.headers;
// Iterate the header *names* with for...of. A for...in loop over an array
// yields its indices ("0", "1", ...), which never match a header name, so
// nothing would be copied.
for (const header of REVISIT_COPY_HEADERS) {
  const value = origWarcHeaders.get(header);
  // Copy only headers that are actually present on the response record.
  if (value != null) {
    warcHeaders[header] = value;
  }
}
const revisitRecord = WARCRecord.create({
url: responseRecord.warcTargetURI!,
@ -2127,7 +2140,7 @@ async function createRevisitForResponse(
maxMemSize: MAX_BROWSER_DEFAULT_FETCH_SIZE,
});
await serializer.digestRecord();
await serializer.digestRecord({ payloadDigestForRevisit });
return { serializer, responseRecord: revisitRecord };
}