browsertrix-crawler/util/reqresp.js
Ilya Kreymer 877d9f5b44
Use new browser-based archiving mechanism instead of pywb proxy (#424)
Major refactoring of Browsertrix Crawler to natively capture network traffic to WARC files
via the Chrome Debug Protocol (CDP). Allows for more flexibility and accuracy when dealing
with HTTP/2.x sites and avoids a MITM proxy. Addresses #343 

Changes include:
- Recorder class for capturing CDP network traffic for each page.
- Handling requests from service workers via matching active frames, skipping unrelated requests outside the page (from background pages, etc.)
- WARC writing support via TS-based warcio.js library.
- Generates single WARC file per worker (still need to add size rollover).
- Request interception via Fetch.requestPaused
- Rule-based rewriting response support (via wabac.js), using Fetch.getResponseBody() / Fetch.fulfillRequest()
- Streaming responses via three methods: inline response fetch via Fetch.takeResponseBodyAsStream, 
async loading via browser network stack with Network.loadNetworkResource() and node-based async fetch
via fetch()
- Direct async fetch() capture of non-HTML URLs
- Waiting for all requests to finish before moving on to the next page, up to the page timeout.
- Experimental: generate CDXJ on-the-fly as WARC is being written (not yet in use).
- removed pywb, using cdxj-indexer for --generateCDX option.
2023-11-07 21:38:50 -08:00

239 lines
6 KiB
JavaScript

import { getStatusText } from "@webrecorder/wabac/src/utils.js";
// Lower-case header names that this module looks up or rewrites.
const CONTENT_LENGTH = "content-length";
const CONTENT_TYPE = "content-type";

// Headers in this list are never emitted under their original names by
// RequestResponseInfo._getHeadersDict().
const EXCLUDE_HEADERS = [
  "content-encoding",
  "transfer-encoding",
];
// ===========================================================================
/**
 * Mutable record describing one request and its response.
 *
 * The `fill*` methods copy fields from event/response objects into this
 * record; the `get*` / `is*` / `has*` methods derive values from whatever has
 * been filled in so far.
 */
export class RequestResponseInfo {
  /**
   * @param requestId - identifier of the request; stored unchanged.
   */
  constructor(requestId) {
    this._created = new Date();
    this.requestId = requestId;
    this.ts = null;

    // request data
    this.method = null;
    this.url = null;
    this.protocol = "HTTP/1.1";
    this.requestHeaders = null;
    this.requestHeadersText = null;
    this.postData = null;
    this.hasPostData = false;

    // response data
    this.status = 0;
    this.statusText = null;
    this.responseHeaders = null;
    this.responseHeadersList = null;
    this.responseHeadersText = null;
    this.payload = null;

    this.fromServiceWorker = false;
    this.fetch = false;
    this.resourceType = null;

    // assigned by fillFetchRequestPaused(); declared here so that every
    // field of the record is visible in one place
    this.frameId = null;

    this.extraOpts = {};

    this.readSize = 0;
    this.expectedSize = 0;

    // set to true to indicate async loading in progress
    this.asyncLoading = false;

    // set to add truncated message
    this.truncated = null;
  }

  /**
   * Copy request fields from `params.request` (url, method, headers,
   * postData, hasPostData) and, when present, `params.type`.
   *
   * Request headers that were already set on this record are kept.
   *
   * @param {object} params - object with a `request` property and an
   *   optional `type` property.
   */
  fillRequest(params) {
    const { request } = params;

    this.url = request.url;
    this.method = request.method;
    if (!this.requestHeaders) {
      this.requestHeaders = request.headers;
    }
    this.postData = request.postData;
    this.hasPostData = request.hasPostData;

    if (params.type) {
      this.resourceType = params.type;
    }
  }

  /**
   * Fill request and response fields from a paused-fetch event object
   * (presumably a CDP `Fetch.requestPaused` event — confirm against caller).
   *
   * Response headers are stored as a list in `responseHeadersList`;
   * `responseHeaders` is not touched. Marks the record with `fetch = true`.
   *
   * @param {object} params - event object; read properties are `request`,
   *   `responseStatusCode`, `responseStatusText`, `responseHeaders`,
   *   `resourceType` and `frameId`.
   */
  fillFetchRequestPaused(params) {
    this.fillRequest(params);

    this.status = params.responseStatusCode;
    this.statusText = params.responseStatusText || getStatusText(this.status);
    this.responseHeadersList = params.responseHeaders;

    this.fetch = true;
    this.resourceType = params.resourceType;
    this.frameId = params.frameId;
  }

  /**
   * Fill response fields from a response object.
   *
   * A 304 response does not replace an already recorded non-304 response.
   * The URL is stored without its fragment. The protocol is only replaced
   * when the response carries one, so the default is never clobbered with
   * `undefined`.
   *
   * @param {object} response - response object; read properties are `url`,
   *   `status`, `statusText`, `protocol`, `requestHeaders`,
   *   `requestHeadersText`, `headers`, `headersText`, `fromServiceWorker`
   *   and `securityDetails`.
   */
  fillResponse(response) {
    // if initial fetch was a 200, but now replacing with 304, don't!
    if (response.status === 304 && this.status && this.status !== 304 && this.url) {
      return;
    }

    this.url = response.url.split("#")[0];

    this.status = response.status;
    this.statusText = response.statusText || getStatusText(this.status);

    if (response.protocol) {
      this.protocol = response.protocol;
    }

    if (response.requestHeaders) {
      this.requestHeaders = response.requestHeaders;
    }
    if (response.requestHeadersText) {
      this.requestHeadersText = response.requestHeadersText;
    }

    this.responseHeaders = response.headers;
    if (response.headersText) {
      this.responseHeadersText = response.headersText;
    }

    this.fromServiceWorker = !!response.fromServiceWorker;

    if (response.securityDetails) {
      const { securityDetails } = response;
      const issuer = securityDetails.issuer || "";
      const ctc = securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0";
      this.extraOpts.cert = { issuer, ctc };
    }
  }

  /**
   * Report whether the recorded response is a redirect (3xx other than 304)
   * whose `Location` header resolves to the recorded URL itself.
   *
   * Only `responseHeaders` is consulted. A response without a `Location`
   * header, or with headers/URLs that cannot be parsed, is not a
   * self-redirect.
   *
   * @returns {boolean}
   */
  isSelfRedirect() {
    if (this.status < 300 || this.status >= 400 || this.status === 304) {
      return false;
    }
    if (!this.responseHeaders) {
      return false;
    }

    try {
      const location = new Headers(this.responseHeaders).get("location");
      // Without this guard `new URL(null, base)` would resolve the literal
      // path "null" and could match a URL that happens to end in "/null".
      if (location === null) {
        return false;
      }
      return new URL(location, this.url).href === this.url;
    } catch (e) {
      // best effort: unparsable headers or URLs are treated as "not a self-redirect"
      return false;
    }
  }

  /**
   * Record `params.resourceIPAddressSpace` as `extraOpts.ipType`.
   *
   * NOTE(review): `params.headers` / `params.headersText` are not applied
   * here; the code that copied them was commented out — confirm whether that
   * is intentional.
   *
   * @param {object} params - object with a `resourceIPAddressSpace` property.
   */
  fillResponseReceivedExtraInfo(params) {
    this.extraOpts.ipType = params.resourceIPAddressSpace;
  }

  /**
   * Fill status and headers from a response whose `headers` is an iterable
   * of `[name, value]` pairs (e.g. a `Headers` object).
   *
   * @param {object} response - object with `headers`, `status` and
   *   `statusText` properties.
   */
  fillFetchResponse(response) {
    this.responseHeaders = Object.fromEntries(response.headers);
    this.status = response.status;
    this.statusText = response.statusText || getStatusText(this.status);
  }

  /**
   * Replace the recorded request headers with `params.headers`.
   *
   * @param {object} params - object with a `headers` property.
   */
  fillRequestExtraInfo(params) {
    this.requestHeaders = params.headers;
  }

  /**
   * Render the status line and response headers as CRLF-delimited text,
   * terminated by an empty line. Newlines inside a header value are replaced
   * with ", ".
   *
   * Uses `responseHeaders` when set and otherwise falls back to
   * `responseHeadersList`, so a record that only has the list form does not
   * throw.
   *
   * @returns {string}
   */
  getResponseHeadersText() {
    let entries;
    if (this.responseHeaders) {
      entries = Object.entries(this.responseHeaders);
    } else {
      entries = (this.responseHeadersList || []).map(({ name, value }) => [name, value]);
    }

    let text = `${this.protocol} ${this.status} ${this.statusText}\r\n`;
    for (const [name, value] of entries) {
      text += `${name}: ${String(value).replace(/\n/g, ", ")}\r\n`;
    }
    text += "\r\n";
    return text;
  }

  /**
   * @returns {boolean} true when a method and some form of request headers
   *   (object or text) have been recorded.
   */
  hasRequest() {
    return Boolean(this.method && (this.requestHeaders || this.requestHeadersText));
  }

  /**
   * Cleaned request headers; see `_getHeadersDict`. `requestHeaders` is
   * modified in place.
   *
   * @returns {object}
   */
  getRequestHeadersDict() {
    return this._getHeadersDict(this.requestHeaders, null);
  }

  /**
   * Cleaned response headers; see `_getHeadersDict`. When `responseHeaders`
   * is set it is modified in place, otherwise a new object is built from
   * `responseHeadersList`.
   *
   * @param {number} [length] - when truthy, replaces the content-length value.
   * @returns {object}
   */
  getResponseHeadersDict(length) {
    return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length);
  }

  /**
   * Produce a cleaned name -> value header object.
   *
   * - names starting with ":" are removed
   * - names listed in EXCLUDE_HEADERS are renamed to "x-orig-<name>"
   * - content-length is replaced by `actualContentLength` when that is truthy
   * - newlines in values are replaced with ", "
   *
   * When `headersDict` is given it is MODIFIED IN PLACE and returned. When
   * only `headersList` is given, a new object keyed by lower-cased names is
   * built (for repeated names the last entry wins) and cleaned by the same
   * rules, so excluded headers are renamed on both paths.
   *
   * @param {object|null} headersDict - name -> value object, or null.
   * @param {Array<{name: string, value: string}>|null} headersList - used
   *   only when `headersDict` is falsy.
   * @param {number} [actualContentLength]
   * @returns {object} the cleaned headers; `{}` when neither input is given.
   */
  _getHeadersDict(headersDict, headersList, actualContentLength) {
    let dict = headersDict;

    if (!dict && headersList) {
      dict = {};
      for (const { name, value } of headersList) {
        dict[name.toLowerCase()] = value;
      }
    }

    if (!dict) {
      return {};
    }

    // Object.keys() is a snapshot, so keys added below are not revisited
    for (const key of Object.keys(dict)) {
      if (key[0] === ":") {
        delete dict[key];
        continue;
      }

      const keyLower = key.toLowerCase();
      const value = String(dict[key]).replace(/\n/g, ", ");

      if (EXCLUDE_HEADERS.includes(keyLower)) {
        dict[`x-orig-${key}`] = value;
        delete dict[key];
        continue;
      }

      if (actualContentLength && keyLower === CONTENT_LENGTH) {
        dict[key] = String(actualContentLength);
        continue;
      }

      dict[key] = value;
    }

    return dict;
  }

  /**
   * Report whether the payload looks like a complete non-HTML body: a
   * payload is present, its length equals the content-length header, and
   * the content-type does not start with "text/html".
   *
   * Note: goes through `getResponseHeadersDict()`, which modifies
   * `responseHeaders` in place.
   *
   * @returns {boolean}
   */
  isValidBinary() {
    if (!this.payload) {
      return false;
    }

    const headers = new Headers(this.getResponseHeadersDict());

    if (Number(headers.get(CONTENT_LENGTH)) !== this.payload.length) {
      return false;
    }

    const contentType = headers.get(CONTENT_TYPE);
    if (contentType && contentType.startsWith("text/html")) {
      return false;
    }

    return true;
  }
}