mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Add WARC-Protocol header (#715)
- Add repeated WARC-Protocol header(s) for HTTP and TLS, as per iipc/warc-specifications#42.
- Also set HTTP/1.0 on the WARC record if the response actually used http/1.0; otherwise keep HTTP/1.1.
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
parent
71de8d6582
commit
e72b34318d
3 changed files with 36 additions and 24 deletions
|
@ -17,7 +17,7 @@ import {
|
||||||
rewriteHLS,
|
rewriteHLS,
|
||||||
} from "@webrecorder/wabac";
|
} from "@webrecorder/wabac";
|
||||||
|
|
||||||
import { WARCRecord } from "warcio";
|
import { WARCRecord, multiValueHeader } from "warcio";
|
||||||
import { TempFileBuffer, WARCSerializer } from "warcio/node";
|
import { TempFileBuffer, WARCSerializer } from "warcio/node";
|
||||||
import { WARCWriter } from "./warcwriter.js";
|
import { WARCWriter } from "./warcwriter.js";
|
||||||
import { RedisCrawlState, WorkerId } from "./state.js";
|
import { RedisCrawlState, WorkerId } from "./state.js";
|
||||||
|
@ -1880,7 +1880,7 @@ function createResponse(
|
||||||
|
|
||||||
const url = reqresp.url;
|
const url = reqresp.url;
|
||||||
const warcVersion = "WARC/1.1";
|
const warcVersion = "WARC/1.1";
|
||||||
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
|
const statusline = `${reqresp.httpProtocol} ${reqresp.status} ${reqresp.statusText}`;
|
||||||
const date = new Date(reqresp.ts).toISOString();
|
const date = new Date(reqresp.ts).toISOString();
|
||||||
|
|
||||||
if (!reqresp.payload) {
|
if (!reqresp.payload) {
|
||||||
|
@ -1893,6 +1893,13 @@ function createResponse(
|
||||||
"WARC-Page-ID": pageid,
|
"WARC-Page-ID": pageid,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (reqresp.protocols.length) {
|
||||||
|
warcHeaders["WARC-Protocol"] = multiValueHeader(
|
||||||
|
"WARC-Protocol",
|
||||||
|
reqresp.protocols,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
if (reqresp.resourceType) {
|
if (reqresp.resourceType) {
|
||||||
warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
|
warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
|
||||||
}
|
}
|
||||||
|
@ -1932,7 +1939,9 @@ function createRequest(
|
||||||
|
|
||||||
const urlParsed = new URL(url);
|
const urlParsed = new URL(url);
|
||||||
|
|
||||||
const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`;
|
const statusline = `${method} ${url.slice(urlParsed.origin.length)} ${
|
||||||
|
reqresp.httpProtocol
|
||||||
|
}`;
|
||||||
|
|
||||||
const requestBody = reqresp.postData
|
const requestBody = reqresp.postData
|
||||||
? [encoder.encode(reqresp.postData)]
|
? [encoder.encode(reqresp.postData)]
|
||||||
|
|
|
@ -25,7 +25,10 @@ export class RequestResponseInfo {
|
||||||
|
|
||||||
method?: string;
|
method?: string;
|
||||||
url!: string;
|
url!: string;
|
||||||
protocol?: string = "HTTP/1.1";
|
|
||||||
|
// protocol for WARC record
|
||||||
|
httpProtocol = "HTTP/1.1";
|
||||||
|
protocols: string[] = [];
|
||||||
|
|
||||||
mimeType?: string;
|
mimeType?: string;
|
||||||
|
|
||||||
|
@ -132,7 +135,12 @@ export class RequestResponseInfo {
|
||||||
|
|
||||||
this.setStatus(response.status, response.statusText);
|
this.setStatus(response.status, response.statusText);
|
||||||
|
|
||||||
this.protocol = response.protocol;
|
if (response.protocol) {
|
||||||
|
if (response.protocol === "http/1.0") {
|
||||||
|
this.httpProtocol = "HTTP/1.0";
|
||||||
|
}
|
||||||
|
this.protocols.push(response.protocol);
|
||||||
|
}
|
||||||
|
|
||||||
if (resourceType) {
|
if (resourceType) {
|
||||||
this.resourceType = resourceType.toLowerCase();
|
this.resourceType = resourceType.toLowerCase();
|
||||||
|
@ -153,11 +161,16 @@ export class RequestResponseInfo {
|
||||||
|
|
||||||
this.fromServiceWorker = !!response.fromServiceWorker;
|
this.fromServiceWorker = !!response.fromServiceWorker;
|
||||||
|
|
||||||
if (response.securityDetails) {
|
const { securityDetails } = response;
|
||||||
const issuer: string = response.securityDetails.issuer || "";
|
|
||||||
|
if (securityDetails) {
|
||||||
|
const securityProtocol = securityDetails.protocol
|
||||||
|
.replaceAll(" ", "/")
|
||||||
|
.toLowerCase();
|
||||||
|
this.protocols.push(securityProtocol);
|
||||||
|
const issuer: string = securityDetails.issuer || "";
|
||||||
const ctc: string =
|
const ctc: string =
|
||||||
response.securityDetails.certificateTransparencyCompliance ===
|
securityDetails.certificateTransparencyCompliance === "compliant"
|
||||||
"compliant"
|
|
||||||
? "1"
|
? "1"
|
||||||
: "0";
|
: "0";
|
||||||
this.extraOpts.cert = { issuer, ctc };
|
this.extraOpts.cert = { issuer, ctc };
|
||||||
|
@ -204,21 +217,6 @@ export class RequestResponseInfo {
|
||||||
this.requestHeaders = params.headers;
|
this.requestHeaders = params.headers;
|
||||||
}
|
}
|
||||||
|
|
||||||
getResponseHeadersText() {
|
|
||||||
let headers = `${this.protocol} ${this.status} ${this.statusText}\r\n`;
|
|
||||||
|
|
||||||
if (this.responseHeaders) {
|
|
||||||
for (const header of Object.keys(this.responseHeaders)) {
|
|
||||||
headers += `${header}: ${this.responseHeaders[header].replace(
|
|
||||||
/\n/g,
|
|
||||||
", ",
|
|
||||||
)}\r\n`;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
headers += "\r\n";
|
|
||||||
return headers;
|
|
||||||
}
|
|
||||||
|
|
||||||
hasRequest() {
|
hasRequest() {
|
||||||
return this.method && (this.requestHeaders || this.requestHeadersText);
|
return this.method && (this.requestHeaders || this.requestHeadersText);
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,6 +24,11 @@ test("run warc and ensure pageinfo records contain the correct resources", async
|
||||||
let foundInvalid = false;
|
let foundInvalid = false;
|
||||||
|
|
||||||
for await (const record of parser) {
|
for await (const record of parser) {
|
||||||
|
if (record.warcType === "response" &&
|
||||||
|
(record.warcTargetURI === "https://old.webrecorder.net/" || record.warcTargetURI === "https://old.webrecorder.net/about")) {
|
||||||
|
expect(record.warcHeaders.headers.get("WARC-Protocol")).toBe("h2, tls/1.3");
|
||||||
|
}
|
||||||
|
|
||||||
if (
|
if (
|
||||||
!foundIndex &&
|
!foundIndex &&
|
||||||
record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/"
|
record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue