Add WARC-Protocol header (#715)

- Add repeated WARC-Protocol header(s) for the HTTP and TLS protocols, as per iipc/warc-specifications#42
- Also set HTTP/1.0 in the WARC record's HTTP status line if the response actually used http/1.0; otherwise keep HTTP/1.1

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
Ilya Kreymer 2025-05-19 18:59:52 -07:00 committed by GitHub
parent 71de8d6582
commit e72b34318d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 36 additions and 24 deletions

View file

@ -17,7 +17,7 @@ import {
rewriteHLS, rewriteHLS,
} from "@webrecorder/wabac"; } from "@webrecorder/wabac";
import { WARCRecord } from "warcio"; import { WARCRecord, multiValueHeader } from "warcio";
import { TempFileBuffer, WARCSerializer } from "warcio/node"; import { TempFileBuffer, WARCSerializer } from "warcio/node";
import { WARCWriter } from "./warcwriter.js"; import { WARCWriter } from "./warcwriter.js";
import { RedisCrawlState, WorkerId } from "./state.js"; import { RedisCrawlState, WorkerId } from "./state.js";
@ -1880,7 +1880,7 @@ function createResponse(
const url = reqresp.url; const url = reqresp.url;
const warcVersion = "WARC/1.1"; const warcVersion = "WARC/1.1";
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`; const statusline = `${reqresp.httpProtocol} ${reqresp.status} ${reqresp.statusText}`;
const date = new Date(reqresp.ts).toISOString(); const date = new Date(reqresp.ts).toISOString();
if (!reqresp.payload) { if (!reqresp.payload) {
@ -1893,6 +1893,13 @@ function createResponse(
"WARC-Page-ID": pageid, "WARC-Page-ID": pageid,
}; };
if (reqresp.protocols.length) {
warcHeaders["WARC-Protocol"] = multiValueHeader(
"WARC-Protocol",
reqresp.protocols,
);
}
if (reqresp.resourceType) { if (reqresp.resourceType) {
warcHeaders["WARC-Resource-Type"] = reqresp.resourceType; warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
} }
@ -1932,7 +1939,9 @@ function createRequest(
const urlParsed = new URL(url); const urlParsed = new URL(url);
const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`; const statusline = `${method} ${url.slice(urlParsed.origin.length)} ${
reqresp.httpProtocol
}`;
const requestBody = reqresp.postData const requestBody = reqresp.postData
? [encoder.encode(reqresp.postData)] ? [encoder.encode(reqresp.postData)]

View file

@ -25,7 +25,10 @@ export class RequestResponseInfo {
method?: string; method?: string;
url!: string; url!: string;
protocol?: string = "HTTP/1.1";
// protocol for WARC record
httpProtocol = "HTTP/1.1";
protocols: string[] = [];
mimeType?: string; mimeType?: string;
@ -132,7 +135,12 @@ export class RequestResponseInfo {
this.setStatus(response.status, response.statusText); this.setStatus(response.status, response.statusText);
this.protocol = response.protocol; if (response.protocol) {
if (response.protocol === "http/1.0") {
this.httpProtocol = "HTTP/1.0";
}
this.protocols.push(response.protocol);
}
if (resourceType) { if (resourceType) {
this.resourceType = resourceType.toLowerCase(); this.resourceType = resourceType.toLowerCase();
@ -153,11 +161,16 @@ export class RequestResponseInfo {
this.fromServiceWorker = !!response.fromServiceWorker; this.fromServiceWorker = !!response.fromServiceWorker;
if (response.securityDetails) { const { securityDetails } = response;
const issuer: string = response.securityDetails.issuer || "";
if (securityDetails) {
const securityProtocol = securityDetails.protocol
.replaceAll(" ", "/")
.toLowerCase();
this.protocols.push(securityProtocol);
const issuer: string = securityDetails.issuer || "";
const ctc: string = const ctc: string =
response.securityDetails.certificateTransparencyCompliance === securityDetails.certificateTransparencyCompliance === "compliant"
"compliant"
? "1" ? "1"
: "0"; : "0";
this.extraOpts.cert = { issuer, ctc }; this.extraOpts.cert = { issuer, ctc };
@ -204,21 +217,6 @@ export class RequestResponseInfo {
this.requestHeaders = params.headers; this.requestHeaders = params.headers;
} }
/**
 * Render the response status line and response headers as raw HTTP
 * header text.
 *
 * The first line is `<protocol> <status> <statusText>`; each header
 * follows as `name: value`, with any newline inside a value replaced
 * by ", ". Every line ends with CRLF and the text is closed by one
 * additional empty CRLF line.
 *
 * @returns the serialized status line and headers
 */
getResponseHeadersText() {
  const lines = [`${this.protocol} ${this.status} ${this.statusText}`];
  if (this.responseHeaders) {
    for (const [name, value] of Object.entries(this.responseHeaders)) {
      // collapse embedded newlines so each header stays on one line
      lines.push(`${name}: ${value.replace(/\n/g, ", ")}`);
    }
  }
  // one CRLF ends the last line, the second is the blank terminator line
  return lines.join("\r\n") + "\r\n\r\n";
}
hasRequest() { hasRequest() {
return this.method && (this.requestHeaders || this.requestHeadersText); return this.method && (this.requestHeaders || this.requestHeadersText);
} }

View file

@ -24,6 +24,11 @@ test("run warc and ensure pageinfo records contain the correct resources", async
let foundInvalid = false; let foundInvalid = false;
for await (const record of parser) { for await (const record of parser) {
if (record.warcType === "response" &&
(record.warcTargetURI === "https://old.webrecorder.net/" || record.warcTargetURI === "https://old.webrecorder.net/about")) {
expect(record.warcHeaders.headers.get("WARC-Protocol")).toBe("h2, tls/1.3");
}
if ( if (
!foundIndex && !foundIndex &&
record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/" record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/"