mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
warc writing:
- update to warcio 2.4.6, write WARC-Payload-Digest along with WARC-Block-Digest for revisits - copy additional custom WARC headers to revisit from response
This commit is contained in:
parent
87c94876f6
commit
bbe084daa0
1 changed files with 23 additions and 10 deletions
|
|
@ -2074,14 +2074,14 @@ function createResponse(
|
|||
warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
|
||||
}
|
||||
|
||||
if (!contentIter) {
|
||||
contentIter = [reqresp.payload] as Iterable<Uint8Array>;
|
||||
}
|
||||
|
||||
if (Object.keys(reqresp.extraOpts).length) {
|
||||
warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts);
|
||||
}
|
||||
|
||||
if (!contentIter) {
|
||||
contentIter = [reqresp.payload] as Iterable<Uint8Array>;
|
||||
}
|
||||
|
||||
return WARCRecord.create(
|
||||
{
|
||||
url,
|
||||
|
|
@ -2096,6 +2096,14 @@ function createResponse(
|
|||
);
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// Names of the WARC headers that createRevisitForResponse looks up on the
// original response record so they can be carried over to the revisit record.
const REVISIT_COPY_HEADERS = [
|
||||
"WARC-Page-ID",
|
||||
"WARC-Protocol",
|
||||
"WARC-Resource-Type",
|
||||
"WARC-JSON-Metadata",
|
||||
];
|
||||
|
||||
// =================================================================
|
||||
// revisit
|
||||
async function createRevisitForResponse(
|
||||
|
|
@ -2104,12 +2112,17 @@ async function createRevisitForResponse(
|
|||
refersToUrl: string,
|
||||
refersToDate: string,
|
||||
) {
|
||||
const origPayloadDigest = responseRecord.warcPayloadDigest;
|
||||
const payloadDigestForRevisit = responseRecord.warcPayloadDigest || "";
|
||||
|
||||
const warcHeaders: Record<string, string> = {
|
||||
"WARC-Page-ID": responseRecord.warcHeaders.headers.get("WARC-Page-ID")!,
|
||||
"WARC-Payload-Digest": origPayloadDigest!,
|
||||
};
|
||||
const warcHeaders: Record<string, string> = {};
|
||||
|
||||
const origWarcHeaders = responseRecord.warcHeaders.headers;
|
||||
|
||||
// Use for...of to iterate the header names; for...in would yield the array indices ("0", "1", ...), so no header would ever match.
for (const header of REVISIT_COPY_HEADERS) {
|
||||
if (origWarcHeaders.has(header)) {
|
||||
warcHeaders[header] = origWarcHeaders.get(header)!;
|
||||
}
|
||||
}
|
||||
|
||||
const revisitRecord = WARCRecord.create({
|
||||
url: responseRecord.warcTargetURI!,
|
||||
|
|
@ -2127,7 +2140,7 @@ async function createRevisitForResponse(
|
|||
maxMemSize: MAX_BROWSER_DEFAULT_FETCH_SIZE,
|
||||
});
|
||||
|
||||
await serializer.digestRecord();
|
||||
await serializer.digestRecord({ payloadDigestForRevisit });
|
||||
|
||||
return { serializer, responseRecord: revisitRecord };
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue