mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
warc writing:
- update to warcio 2.4.6, write WARC-Payload-Digest along with WARC-Block-Digest for revisits
- copy additional custom WARC headers to revisit from response
This commit is contained in:
parent
87c94876f6
commit
bbe084daa0
1 changed files with 23 additions and 10 deletions
|
|
@ -2074,14 +2074,14 @@ function createResponse(
|
||||||
warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
|
warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!contentIter) {
|
|
||||||
contentIter = [reqresp.payload] as Iterable<Uint8Array>;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (Object.keys(reqresp.extraOpts).length) {
|
if (Object.keys(reqresp.extraOpts).length) {
|
||||||
warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts);
|
warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!contentIter) {
|
||||||
|
contentIter = [reqresp.payload] as Iterable<Uint8Array>;
|
||||||
|
}
|
||||||
|
|
||||||
return WARCRecord.create(
|
return WARCRecord.create(
|
||||||
{
|
{
|
||||||
url,
|
url,
|
||||||
|
|
@ -2096,6 +2096,14 @@ function createResponse(
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =================================================================
|
||||||
|
const REVISIT_COPY_HEADERS = [
|
||||||
|
"WARC-Page-ID",
|
||||||
|
"WARC-Protocol",
|
||||||
|
"WARC-Resource-Type",
|
||||||
|
"WARC-JSON-Metadata",
|
||||||
|
];
|
||||||
|
|
||||||
// =================================================================
|
// =================================================================
|
||||||
// revisit
|
// revisit
|
||||||
async function createRevisitForResponse(
|
async function createRevisitForResponse(
|
||||||
|
|
@ -2104,12 +2112,17 @@ async function createRevisitForResponse(
|
||||||
refersToUrl: string,
|
refersToUrl: string,
|
||||||
refersToDate: string,
|
refersToDate: string,
|
||||||
) {
|
) {
|
||||||
const origPayloadDigest = responseRecord.warcPayloadDigest;
|
const payloadDigestForRevisit = responseRecord.warcPayloadDigest || "";
|
||||||
|
|
||||||
const warcHeaders: Record<string, string> = {
|
const warcHeaders: Record<string, string> = {};
|
||||||
"WARC-Page-ID": responseRecord.warcHeaders.headers.get("WARC-Page-ID")!,
|
|
||||||
"WARC-Payload-Digest": origPayloadDigest!,
|
const origWarcHeaders = responseRecord.warcHeaders.headers;
|
||||||
};
|
|
||||||
|
for (const header of REVISIT_COPY_HEADERS) {
|
||||||
|
if (origWarcHeaders.has(header)) {
|
||||||
|
warcHeaders[header] = origWarcHeaders.get(header)!;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const revisitRecord = WARCRecord.create({
|
const revisitRecord = WARCRecord.create({
|
||||||
url: responseRecord.warcTargetURI!,
|
url: responseRecord.warcTargetURI!,
|
||||||
|
|
@ -2127,7 +2140,7 @@ async function createRevisitForResponse(
|
||||||
maxMemSize: MAX_BROWSER_DEFAULT_FETCH_SIZE,
|
maxMemSize: MAX_BROWSER_DEFAULT_FETCH_SIZE,
|
||||||
});
|
});
|
||||||
|
|
||||||
await serializer.digestRecord();
|
await serializer.digestRecord({ payloadDigestForRevisit });
|
||||||
|
|
||||||
return { serializer, responseRecord: revisitRecord };
|
return { serializer, responseRecord: revisitRecord };
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue