mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Add WARC-Protocol header (#715)
- add WARC-Protocol repeated header(s) for HTTP, TLS as per iipc/warc-specifications#42
- also set HTTP/1.0 on WARC record if actually http/1.0, otherwise keep HTTP/1.1

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
parent
71de8d6582
commit
e72b34318d
3 changed files with 36 additions and 24 deletions
|
@ -17,7 +17,7 @@ import {
|
|||
rewriteHLS,
|
||||
} from "@webrecorder/wabac";
|
||||
|
||||
import { WARCRecord } from "warcio";
|
||||
import { WARCRecord, multiValueHeader } from "warcio";
|
||||
import { TempFileBuffer, WARCSerializer } from "warcio/node";
|
||||
import { WARCWriter } from "./warcwriter.js";
|
||||
import { RedisCrawlState, WorkerId } from "./state.js";
|
||||
|
@ -1880,7 +1880,7 @@ function createResponse(
|
|||
|
||||
const url = reqresp.url;
|
||||
const warcVersion = "WARC/1.1";
|
||||
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
|
||||
const statusline = `${reqresp.httpProtocol} ${reqresp.status} ${reqresp.statusText}`;
|
||||
const date = new Date(reqresp.ts).toISOString();
|
||||
|
||||
if (!reqresp.payload) {
|
||||
|
@ -1893,6 +1893,13 @@ function createResponse(
|
|||
"WARC-Page-ID": pageid,
|
||||
};
|
||||
|
||||
if (reqresp.protocols.length) {
|
||||
warcHeaders["WARC-Protocol"] = multiValueHeader(
|
||||
"WARC-Protocol",
|
||||
reqresp.protocols,
|
||||
);
|
||||
}
|
||||
|
||||
if (reqresp.resourceType) {
|
||||
warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
|
||||
}
|
||||
|
@ -1932,7 +1939,9 @@ function createRequest(
|
|||
|
||||
const urlParsed = new URL(url);
|
||||
|
||||
const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`;
|
||||
const statusline = `${method} ${url.slice(urlParsed.origin.length)} ${
|
||||
reqresp.httpProtocol
|
||||
}`;
|
||||
|
||||
const requestBody = reqresp.postData
|
||||
? [encoder.encode(reqresp.postData)]
|
||||
|
|
|
@ -25,7 +25,10 @@ export class RequestResponseInfo {
|
|||
|
||||
method?: string;
|
||||
url!: string;
|
||||
protocol?: string = "HTTP/1.1";
|
||||
|
||||
// protocol for WARC record
|
||||
httpProtocol = "HTTP/1.1";
|
||||
protocols: string[] = [];
|
||||
|
||||
mimeType?: string;
|
||||
|
||||
|
@ -132,7 +135,12 @@ export class RequestResponseInfo {
|
|||
|
||||
this.setStatus(response.status, response.statusText);
|
||||
|
||||
this.protocol = response.protocol;
|
||||
if (response.protocol) {
|
||||
if (response.protocol === "http/1.0") {
|
||||
this.httpProtocol = "HTTP/1.0";
|
||||
}
|
||||
this.protocols.push(response.protocol);
|
||||
}
|
||||
|
||||
if (resourceType) {
|
||||
this.resourceType = resourceType.toLowerCase();
|
||||
|
@ -153,11 +161,16 @@ export class RequestResponseInfo {
|
|||
|
||||
this.fromServiceWorker = !!response.fromServiceWorker;
|
||||
|
||||
if (response.securityDetails) {
|
||||
const issuer: string = response.securityDetails.issuer || "";
|
||||
const { securityDetails } = response;
|
||||
|
||||
if (securityDetails) {
|
||||
const securityProtocol = securityDetails.protocol
|
||||
.replaceAll(" ", "/")
|
||||
.toLowerCase();
|
||||
this.protocols.push(securityProtocol);
|
||||
const issuer: string = securityDetails.issuer || "";
|
||||
const ctc: string =
|
||||
response.securityDetails.certificateTransparencyCompliance ===
|
||||
"compliant"
|
||||
securityDetails.certificateTransparencyCompliance === "compliant"
|
||||
? "1"
|
||||
: "0";
|
||||
this.extraOpts.cert = { issuer, ctc };
|
||||
|
@ -204,21 +217,6 @@ export class RequestResponseInfo {
|
|||
this.requestHeaders = params.headers;
|
||||
}
|
||||
|
||||
getResponseHeadersText() {
|
||||
let headers = `${this.protocol} ${this.status} ${this.statusText}\r\n`;
|
||||
|
||||
if (this.responseHeaders) {
|
||||
for (const header of Object.keys(this.responseHeaders)) {
|
||||
headers += `${header}: ${this.responseHeaders[header].replace(
|
||||
/\n/g,
|
||||
", ",
|
||||
)}\r\n`;
|
||||
}
|
||||
}
|
||||
headers += "\r\n";
|
||||
return headers;
|
||||
}
|
||||
|
||||
hasRequest() {
|
||||
return this.method && (this.requestHeaders || this.requestHeadersText);
|
||||
}
|
||||
|
|
|
@ -24,6 +24,11 @@ test("run warc and ensure pageinfo records contain the correct resources", async
|
|||
let foundInvalid = false;
|
||||
|
||||
for await (const record of parser) {
|
||||
if (record.warcType === "response" &&
|
||||
(record.warcTargetURI === "https://old.webrecorder.net/" || record.warcTargetURI === "https://old.webrecorder.net/about")) {
|
||||
expect(record.warcHeaders.headers.get("WARC-Protocol")).toBe("h2, tls/1.3");
|
||||
}
|
||||
|
||||
if (
|
||||
!foundIndex &&
|
||||
record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue