Add WARC-Protocol header (#715)

- add repeated WARC-Protocol header(s) for the HTTP and TLS protocols, as per iipc/warc-specifications#42
- also set HTTP/1.0 on the WARC record if the response actually used http/1.0; otherwise keep HTTP/1.1

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
Ilya Kreymer 2025-05-19 18:59:52 -07:00 committed by GitHub
parent 71de8d6582
commit e72b34318d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 36 additions and 24 deletions

View file

@ -17,7 +17,7 @@ import {
rewriteHLS,
} from "@webrecorder/wabac";
import { WARCRecord } from "warcio";
import { WARCRecord, multiValueHeader } from "warcio";
import { TempFileBuffer, WARCSerializer } from "warcio/node";
import { WARCWriter } from "./warcwriter.js";
import { RedisCrawlState, WorkerId } from "./state.js";
@ -1880,7 +1880,7 @@ function createResponse(
const url = reqresp.url;
const warcVersion = "WARC/1.1";
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
const statusline = `${reqresp.httpProtocol} ${reqresp.status} ${reqresp.statusText}`;
const date = new Date(reqresp.ts).toISOString();
if (!reqresp.payload) {
@ -1893,6 +1893,13 @@ function createResponse(
"WARC-Page-ID": pageid,
};
if (reqresp.protocols.length) {
warcHeaders["WARC-Protocol"] = multiValueHeader(
"WARC-Protocol",
reqresp.protocols,
);
}
if (reqresp.resourceType) {
warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
}
@ -1932,7 +1939,9 @@ function createRequest(
const urlParsed = new URL(url);
const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`;
const statusline = `${method} ${url.slice(urlParsed.origin.length)} ${
reqresp.httpProtocol
}`;
const requestBody = reqresp.postData
? [encoder.encode(reqresp.postData)]

View file

@ -25,7 +25,10 @@ export class RequestResponseInfo {
method?: string;
url!: string;
protocol?: string = "HTTP/1.1";
// protocol for WARC record
httpProtocol = "HTTP/1.1";
protocols: string[] = [];
mimeType?: string;
@ -132,7 +135,12 @@ export class RequestResponseInfo {
this.setStatus(response.status, response.statusText);
this.protocol = response.protocol;
if (response.protocol) {
if (response.protocol === "http/1.0") {
this.httpProtocol = "HTTP/1.0";
}
this.protocols.push(response.protocol);
}
if (resourceType) {
this.resourceType = resourceType.toLowerCase();
@ -153,11 +161,16 @@ export class RequestResponseInfo {
this.fromServiceWorker = !!response.fromServiceWorker;
if (response.securityDetails) {
const issuer: string = response.securityDetails.issuer || "";
const { securityDetails } = response;
if (securityDetails) {
const securityProtocol = securityDetails.protocol
.replaceAll(" ", "/")
.toLowerCase();
this.protocols.push(securityProtocol);
const issuer: string = securityDetails.issuer || "";
const ctc: string =
response.securityDetails.certificateTransparencyCompliance ===
"compliant"
securityDetails.certificateTransparencyCompliance === "compliant"
? "1"
: "0";
this.extraOpts.cert = { issuer, ctc };
@ -204,21 +217,6 @@ export class RequestResponseInfo {
this.requestHeaders = params.headers;
}
/**
 * Serializes the response status line and headers as raw header text.
 *
 * The first line is `<protocol> <status> <statusText>`; it is followed by one
 * `name: value` line per entry of `responseHeaders` (when set), and the text
 * is terminated by an empty line. All lines end in CRLF.
 *
 * @returns the header block as a single string
 */
getResponseHeadersText() {
  const lines = [`${this.protocol} ${this.status} ${this.statusText}`];

  if (this.responseHeaders) {
    for (const [name, value] of Object.entries(this.responseHeaders)) {
      // A header value may span several lines; fold it onto a single line.
      lines.push(`${name}: ${value.replace(/\n/g, ", ")}`);
    }
  }

  // Every line is CRLF-terminated, plus one extra CRLF for the closing blank line.
  return lines.join("\r\n") + "\r\n\r\n";
}
/**
 * Indicates whether request data is available on this object.
 *
 * The result is truthy only when `method` is set and either `requestHeaders`
 * or `requestHeadersText` is set. The underlying value is returned as-is
 * (not coerced to a boolean).
 */
hasRequest() {
  if (!this.method) {
    // No method recorded: hand back the falsy value itself.
    return this.method;
  }
  return this.requestHeaders || this.requestHeadersText;
}

View file

@ -24,6 +24,11 @@ test("run warc and ensure pageinfo records contain the correct resources", async
let foundInvalid = false;
for await (const record of parser) {
if (record.warcType === "response" &&
(record.warcTargetURI === "https://old.webrecorder.net/" || record.warcTargetURI === "https://old.webrecorder.net/about")) {
expect(record.warcHeaders.headers.get("WARC-Protocol")).toBe("h2, tls/1.3");
}
if (
!foundIndex &&
record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/"