mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00

Major refactoring of Browsertrix Crawler to natively capture network traffic to WARC files via the Chrome Debug Protocol (CDP). Allows for more flexibility and accuracy when dealing with HTTP/2.x sites and avoids a MITM proxy. Addresses #343. Changes include: - Recorder class for capturing CDP network traffic for each page. - Handling requests from service workers via matching active frames, skipping unrelated requests outside the page (from background pages, etc.) - WARC writing support via the TS-based warcio.js library. - Generates a single WARC file per worker (still need to add size rollover). - Request interception via Fetch.requestPaused - Rule-based response rewriting support (via wabac.js), using Fetch.getResponseBody() / Fetch.fulfillRequest() - Streaming responses via three methods: inline response fetch via Fetch.takeResponseBodyAsStream, async loading via the browser network stack with Network.loadNetworkResource(), and node-based async fetch via fetch() - Direct async fetch() capture of non-HTML URLs - Waiting for all requests to finish before moving on to the next page, up to the page timeout. - Experimental: generate CDXJ on-the-fly as the WARC is being written (not yet in use). - Removed pywb, using cdxj-indexer for the --generateCDX option.
239 lines
6 KiB
JavaScript
239 lines
6 KiB
JavaScript
import { getStatusText } from "@webrecorder/wabac/src/utils.js";
// Lowercase header names that the header-normalization code looks up.
const CONTENT_LENGTH = "content-length";
const CONTENT_TYPE = "content-type";

// Headers that are not emitted under their own name; the header-dict helper
// re-emits them with an "x-orig-" prefix. Frozen because the list is shared
// by every instance and must not be modified at runtime.
const EXCLUDE_HEADERS = Object.freeze(["content-encoding", "transfer-encoding"]);

// ===========================================================================
/**
 * Accumulates what is known about a single request/response pair.
 *
 * The `fill*` methods copy fields out of the objects they are given (the
 * shapes read here look like Chrome DevTools Protocol Network/Fetch payloads
 * and a fetch() Response -- confirm against the callers). The `get*`, `has*`
 * and `is*` methods derive values from the accumulated state without
 * modifying it.
 */
export class RequestResponseInfo
{
  /**
   * @param {*} requestId - identifier of the request; stored as-is.
   */
  constructor(requestId) {
    this._created = new Date();

    this.requestId = requestId;

    this.ts = null;

    // request data
    this.method = null;
    this.url = null;
    this.protocol = "HTTP/1.1";

    this.requestHeaders = null;
    this.requestHeadersText = null;

    this.postData = null;
    this.hasPostData = false;

    // response data
    this.status = 0;
    this.statusText = null;

    // responseHeaders is a name -> value object, responseHeadersList an array
    // of {name, value} entries; the header helpers use the object when it is
    // set and fall back to the list otherwise.
    this.responseHeaders = null;
    this.responseHeadersList = null;
    this.responseHeadersText = null;

    this.payload = null;

    this.fromServiceWorker = false;

    this.fetch = false;

    this.resourceType = null;

    // assigned by fillFetchRequestPaused(); initialised here so that every
    // instance has the same set of fields
    this.frameId = null;

    this.extraOpts = {};

    this.readSize = 0;
    this.expectedSize = 0;

    // set to true to indicate async loading in progress
    this.asyncLoading = false;

    // set to add truncated message
    this.truncated = null;
  }

  /**
   * Copy the request fields from `params.request` (and `params.type`, if set).
   *
   * Request headers are only taken from `params.request.headers` when none
   * have been recorded yet, so headers stored earlier are not overwritten.
   *
   * @param {object} params - object with a `request` member providing `url`,
   *   `method`, `headers`, `postData` and `hasPostData`, plus optional `type`.
   */
  fillRequest(params) {
    const { request } = params;

    this.url = request.url;
    this.method = request.method;
    if (!this.requestHeaders) {
      this.requestHeaders = request.headers;
    }
    this.postData = request.postData;
    this.hasPostData = request.hasPostData;

    if (params.type) {
      this.resourceType = params.type;
    }
  }

  /**
   * Fill both request and response fields from a single `params` object
   * (presumably a Fetch.requestPaused payload -- confirm against caller).
   *
   * Response headers are stored in list form (`responseHeadersList`), the
   * instance is flagged with `fetch = true`, and `resourceType` is always
   * overwritten with `params.resourceType`.
   *
   * @param {object} params - see fillRequest() for the request part; also
   *   reads `responseStatusCode`, `responseStatusText`, `responseHeaders`,
   *   `resourceType` and `frameId`.
   */
  fillFetchRequestPaused(params) {
    this.fillRequest(params);

    this.status = params.responseStatusCode;
    this.statusText = params.responseStatusText || getStatusText(this.status);

    this.responseHeadersList = params.responseHeaders;

    this.fetch = true;
    this.resourceType = params.resourceType;

    this.frameId = params.frameId;
  }

  /**
   * Fill the response fields from `response`.
   *
   * A 304 is ignored when a different, non-zero status and a URL are already
   * recorded, so a previously captured response is not replaced by the 304.
   * The URL is stored without its fragment.
   *
   * @param {object} response - reads `url`, `status`, `statusText`,
   *   `protocol`, `headers`, and the optional `requestHeaders`,
   *   `requestHeadersText`, `headersText`, `fromServiceWorker` and
   *   `securityDetails` members.
   */
  fillResponse(response) {
    // if initial fetch was a 200, but now replacing with 304, don't!
    if (response.status === 304 && this.status && this.status !== 304 && this.url) {
      return;
    }

    this.url = response.url.split("#")[0];

    this.status = response.status;
    this.statusText = response.statusText || getStatusText(this.status);

    this.protocol = response.protocol;

    if (response.requestHeaders) {
      this.requestHeaders = response.requestHeaders;
    }
    if (response.requestHeadersText) {
      this.requestHeadersText = response.requestHeadersText;
    }

    this.responseHeaders = response.headers;

    if (response.headersText) {
      this.responseHeadersText = response.headersText;
    }

    this.fromServiceWorker = !!response.fromServiceWorker;

    if (response.securityDetails) {
      const issuer = response.securityDetails.issuer || "";
      const ctc = response.securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0";
      this.extraOpts.cert = {issuer, ctc};
    }
  }

  /**
   * Check whether the response is a redirect back to its own URL.
   *
   * Only 3xx statuses other than 304 are considered. The Location header is
   * looked up case-insensitively in the recorded response headers (object
   * form, or list form when no object is recorded) and resolved against
   * `this.url`. A response without a Location header is never a
   * self-redirect.
   *
   * @returns {boolean} true if the resolved Location equals `this.url`.
   */
  isSelfRedirect() {
    if (this.status < 300 || this.status >= 400 || this.status === 304) {
      return false;
    }
    try {
      const headers = new Headers(this.getResponseHeadersDict());
      const location = headers.get("location");
      // without this guard a missing header would be resolved as the
      // literal relative path "null"
      if (location === null) {
        return false;
      }
      const redirUrl = new URL(location, this.url).href;
      return this.url === redirUrl;
    } catch (e) {
      // best-effort check: headers or URLs that cannot be parsed are
      // treated as "not a self-redirect"
      return false;
    }
  }

  /**
   * Record the IP address space from `params.resourceIPAddressSpace` in
   * `extraOpts.ipType`. Header fields of `params` are not copied (the code
   * doing so is disabled below).
   *
   * @param {object} params - reads `resourceIPAddressSpace`.
   */
  fillResponseReceivedExtraInfo(params) {
    // this.responseHeaders = params.headers;
    // if (params.headersText) {
    //   this.responseHeadersText = params.headersText;
    // }
    this.extraOpts.ipType = params.resourceIPAddressSpace;
  }

  /**
   * Fill status and headers from `response`, whose `headers` member must be
   * an iterable of [name, value] pairs (as on a fetch() Response).
   *
   * @param {object} response - reads `headers`, `status` and `statusText`.
   */
  fillFetchResponse(response) {
    this.responseHeaders = Object.fromEntries(response.headers);
    this.status = response.status;
    this.statusText = response.statusText || getStatusText(this.status);
  }

  /**
   * Replace the recorded request headers with `params.headers`.
   *
   * @param {object} params - reads `headers`.
   */
  fillRequestExtraInfo(params) {
    this.requestHeaders = params.headers;
  }

  /**
   * Serialize the status line and response headers as an HTTP header block.
   *
   * Headers come from `responseHeaders`, or from `responseHeadersList` when
   * no header object is recorded (previously that case threw a TypeError).
   * Newlines inside a value are replaced with ", ".
   *
   * @returns {string} status line plus "name: value" lines, each terminated
   *   by CRLF, followed by an empty line.
   */
  getResponseHeadersText() {
    const entries = this.responseHeaders
      ? Object.entries(this.responseHeaders)
      : (this.responseHeadersList || []).map(({name, value}) => [name, value]);

    let headers = `${this.protocol} ${this.status} ${this.statusText}\r\n`;

    for (const [name, value] of entries) {
      headers += `${name}: ${value.replace(/\n/g, ", ")}\r\n`;
    }
    headers += "\r\n";
    return headers;
  }

  /**
   * @returns {*} a truthy value when a method and request headers (object or
   *   text form) are recorded, a falsy value otherwise.
   */
  hasRequest() {
    return this.method && (this.requestHeaders || this.requestHeadersText);
  }

  /**
   * @returns {object} normalized copy of the request headers, see
   *   _getHeadersDict().
   */
  getRequestHeadersDict() {
    return this._getHeadersDict(this.requestHeaders, null);
  }

  /**
   * @param {number} [length] - if truthy, replaces the content-length value.
   * @returns {object} normalized copy of the response headers, see
   *   _getHeadersDict().
   */
  getResponseHeadersDict(length) {
    return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length);
  }

  /**
   * Build a normalized name -> value header object.
   *
   * `headersDict` is used when it is set; otherwise `headersList` is used
   * with its names lowercased. A new object is always returned: neither
   * input is modified, so repeated calls give the same result.
   *
   * Normalization applied to every header:
   * - names starting with ":" are dropped;
   * - names listed in EXCLUDE_HEADERS are kept under an "x-orig-" prefix;
   * - content-length is replaced by `actualContentLength` when that is truthy;
   * - newlines inside a value are replaced with ", ";
   * - repeated names in the list are joined with ", " (content-length
   *   excepted, where the last value wins).
   *
   * @param {?object} headersDict - name -> value header object.
   * @param {?Array<{name: string, value: string}>} headersList - header
   *   entries, used only when `headersDict` is not set.
   * @param {number} [actualContentLength] - replacement content-length.
   * @returns {object} the normalized headers; empty if no input is set.
   */
  _getHeadersDict(headersDict, headersList, actualContentLength) {
    let entries;

    if (headersDict) {
      entries = Object.entries(headersDict);
    } else if (headersList) {
      entries = headersList.map(({name, value}) => [name.toLowerCase(), value]);
    } else {
      return {};
    }

    const result = {};

    for (const [name, value] of entries) {
      if (name[0] === ":") {
        continue;
      }

      const nameLower = name.toLowerCase();
      const isContentLength = nameLower === CONTENT_LENGTH;

      if (actualContentLength && isContentLength) {
        result[name] = "" + actualContentLength;
        continue;
      }

      const key = EXCLUDE_HEADERS.includes(nameLower) ? "x-orig-" + name : name;
      const text = value.replace(/\n/g, ", ");

      if (!isContentLength && Object.hasOwn(result, key)) {
        // repeated header: keep every value instead of only the last one
        result[key] += ", " + text;
      } else {
        result[key] = text;
      }
    }

    return result;
  }

  /**
   * Check whether the payload can be treated as a complete non-HTML body.
   *
   * @returns {boolean} true when a payload is present, the content-length
   *   response header (0 if absent) equals `payload.length`, and the
   *   content-type does not start with "text/html".
   */
  isValidBinary() {
    if (!this.payload) {
      return false;
    }

    const length = this.payload.length;

    const headers = new Headers(this.getResponseHeadersDict());
    const contentType = headers.get(CONTENT_TYPE);
    const contentLength = headers.get(CONTENT_LENGTH);

    if (Number(contentLength) !== length) {
      return false;
    }

    if (contentType && contentType.startsWith("text/html")) {
      return false;
    }

    return true;
  }
}