diff --git a/src/util/recorder.ts b/src/util/recorder.ts index bbaffa43..168ab63f 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -17,7 +17,7 @@ import { rewriteHLS, } from "@webrecorder/wabac"; -import { WARCRecord } from "warcio"; +import { WARCRecord, multiValueHeader } from "warcio"; import { TempFileBuffer, WARCSerializer } from "warcio/node"; import { WARCWriter } from "./warcwriter.js"; import { RedisCrawlState, WorkerId } from "./state.js"; @@ -1880,7 +1880,7 @@ function createResponse( const url = reqresp.url; const warcVersion = "WARC/1.1"; - const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`; + const statusline = `${reqresp.httpProtocol} ${reqresp.status} ${reqresp.statusText}`; const date = new Date(reqresp.ts).toISOString(); if (!reqresp.payload) { @@ -1893,6 +1893,13 @@ function createResponse( "WARC-Page-ID": pageid, }; + if (reqresp.protocols.length) { + warcHeaders["WARC-Protocol"] = multiValueHeader( + "WARC-Protocol", + reqresp.protocols, + ); + } + if (reqresp.resourceType) { warcHeaders["WARC-Resource-Type"] = reqresp.resourceType; } @@ -1932,7 +1939,9 @@ function createRequest( const urlParsed = new URL(url); - const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`; + const statusline = `${method} ${url.slice(urlParsed.origin.length)} ${ + reqresp.httpProtocol + }`; const requestBody = reqresp.postData ? [encoder.encode(reqresp.postData)] diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 8c738a33..b1efa240 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -25,7 +25,10 @@ export class RequestResponseInfo { method?: string; url!: string; - protocol?: string = "HTTP/1.1"; + + // protocol for WARC record + httpProtocol = "HTTP/1.1"; + protocols: string[] = []; mimeType?: string; @@ -132,7 +135,12 @@ export class RequestResponseInfo { this.setStatus(response.status, response.statusText); - this.protocol = response.protocol; + if (response.protocol) { + if (response.protocol === "http/1.0") { + this.httpProtocol = "HTTP/1.0"; + } + this.protocols.push(response.protocol); + } if (resourceType) { this.resourceType = resourceType.toLowerCase(); @@ -153,11 +161,16 @@ export class RequestResponseInfo { this.fromServiceWorker = !!response.fromServiceWorker; - if (response.securityDetails) { - const issuer: string = response.securityDetails.issuer || ""; + const { securityDetails } = response; + + if (securityDetails) { + const securityProtocol = securityDetails.protocol + .replaceAll(" ", "/") + .toLowerCase(); + this.protocols.push(securityProtocol); + const issuer: string = securityDetails.issuer || ""; const ctc: string = - response.securityDetails.certificateTransparencyCompliance === - "compliant" + securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0"; this.extraOpts.cert = { issuer, ctc }; @@ -204,21 +217,6 @@ export class RequestResponseInfo { this.requestHeaders = params.headers; } - getResponseHeadersText() { - let headers = `${this.protocol} ${this.status} ${this.statusText}\r\n`; - - if (this.responseHeaders) { - for (const header of Object.keys(this.responseHeaders)) { - headers += `${header}: ${this.responseHeaders[header].replace( - /\n/g, - ", ", - )}\r\n`; - } - } - headers += "\r\n"; - return headers; - } - hasRequest() { return this.method && (this.requestHeaders || this.requestHeadersText); } diff --git a/tests/pageinfo-records.test.js b/tests/pageinfo-records.test.js index 01dc77a4..061dbe8c 100644 --- a/tests/pageinfo-records.test.js +++ b/tests/pageinfo-records.test.js @@ -24,6 +24,11 @@ test("run warc and ensure pageinfo records contain the correct resources", async let foundInvalid = false; for await (const record of parser) { + if (record.warcType === "response" && + (record.warcTargetURI === "https://old.webrecorder.net/" || record.warcTargetURI === "https://old.webrecorder.net/about")) { + expect(record.warcHeaders.headers.get("WARC-Protocol")).toBe("h2, tls/1.3"); + } + if ( !foundIndex && record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/"