recorder: don't do a streaming fetch for unknown-size or large responses if the content-type is text/html, because HTML pages always need to be loaded in the browser to continue
type fixes: remove a few 'any' types in reqresp
This commit is contained in:
Ilya Kreymer 2023-11-09 08:55:42 -08:00
parent 5a6bef890f
commit e534f49e5e
2 changed files with 22 additions and 34 deletions

View file

@ -20,6 +20,7 @@ import { TempFileBuffer, WARCSerializer } from "warcio/node";
import { WARCWriter } from "./warcwriter.js";
import { RedisCrawlState, WorkerId } from "./state.js";
import { CDPSession, Protocol } from "puppeteer-core";
import { Crawler } from "../crawler.js";
const MAX_BROWSER_FETCH_SIZE = 2_000_000;
const MAX_NETWORK_LOAD_SIZE = 200_000_000;
@ -43,9 +44,8 @@ export class Recorder
{
workerid: WorkerId;
collDir: string;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
crawler: any;
crawler: Crawler;
crawlState: RedisCrawlState;
@ -74,13 +74,14 @@ export class Recorder
writer: WARCWriter;
pageUrl!: string;
pageid!: string;
// TODO: Fix this the next time the file is edited.
constructor(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
{workerid, collDir, crawler} : {workerid: WorkerId, collDir: string, crawler: any}
{workerid, collDir, crawler} : {workerid: WorkerId, collDir: string, crawler: Crawler}
) {
this.workerid = workerid;
this.crawler = crawler;
@ -354,7 +355,12 @@ export class Recorder
let streamingConsume = false;
if (contentLen < 0 || contentLen > MAX_BROWSER_FETCH_SIZE) {
const contentType = this._getContentType(responseHeaders);
// stream async response if size is unknown or greater than browser fetch size,
// may potentially not serve in the browser, depending on size.
// except for HTML pages, since need to always load response in browser
if (contentType !== "text/html" && (contentLen < 0 || contentLen > MAX_BROWSER_FETCH_SIZE)) {
const opts = {tempdir: this.tempdir, reqresp, expectedSize: contentLen, recorder: this, networkId, cdp};
// fetching using response stream, await here and then either call fulFill, or if not started, return false
@ -397,7 +403,7 @@ export class Recorder
}
}
const rewritten = await this.rewriteResponse(reqresp);
const rewritten = await this.rewriteResponse(reqresp, contentType);
// if in service worker, serialize here
// as won't be getting a loadingFinished message
@ -439,6 +445,7 @@ export class Recorder
startPage({pageid, url} : {pageid: string, url: string}) {
this.pageid = pageid;
this.pageUrl = url;
this.logDetails = {page: url, workerid: this.workerid};
if (this.pendingRequests && this.pendingRequests.size) {
logger.debug("Interrupting timed out requests, moving to next page", this.logDetails, "recorder");
@ -527,8 +534,8 @@ export class Recorder
return false;
}
async rewriteResponse(reqresp: RequestResponseInfo) {
const { url, responseHeadersList, extraOpts, payload } = reqresp;
async rewriteResponse(reqresp: RequestResponseInfo, contentType: string | null) {
const { url, extraOpts, payload } = reqresp;
if (!payload || !payload.length) {
return false;
@ -537,9 +544,7 @@ export class Recorder
let newString = null;
let string = null;
const ct = this._getContentType(responseHeadersList);
switch (ct) {
switch (contentType) {
case "application/x-mpegURL":
case "application/vnd.apple.mpegurl":
string = payload.toString();

View file

@ -65,29 +65,16 @@ export class RequestResponseInfo
this.requestId = requestId;
}
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
fillRequest(params: Record<string, any>) {
fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) {
this.url = params.request.url;
this.method = params.request.method;
if (!this.requestHeaders) {
this.requestHeaders = params.request.headers;
}
this.postData = params.request.postData;
this.hasPostData = params.request.hasPostData;
this.hasPostData = params.request.hasPostData || false;
if (params.type) {
this.resourceType = params.type;
}
}
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
fillFetchRequestPaused(params: Record<string, any>) {
this.fillRequest(params);
this.status = params.responseStatusCode;
this.status = params.responseStatusCode || 0;
this.statusText = params.responseStatusText || getStatusText(this.status);
this.responseHeadersList = params.responseHeaders;
@ -147,7 +134,7 @@ export class RequestResponseInfo
}
}
fillResponseReceivedExtraInfo(params: Record<string, string>) {
fillResponseReceivedExtraInfo(params: Protocol.Network.ResponseReceivedExtraInfoEvent) {
// this.responseHeaders = params.headers;
// if (params.headersText) {
// this.responseHeadersText = params.headersText;
@ -155,18 +142,14 @@ export class RequestResponseInfo
this.extraOpts.ipType = params.resourceIPAddressSpace;
}
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
fillFetchResponse(response: Record<string, any>) {
// Populate the response fields of this record (headers, status, status text)
// from a fetch `Response` object.
fillFetchResponse(response: Response) {
// convert the response's header entries into a plain key/value object
this.responseHeaders = Object.fromEntries(response.headers);
this.status = response.status;
// fall back to getStatusText(this.status) when the response provides no status text
this.statusText = response.statusText || getStatusText(this.status);
}
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
fillRequestExtraInfo(params: Record<string, any>) {
// Set the request headers from a RequestWillBeSentExtraInfoEvent,
// unconditionally replacing any value already stored in requestHeaders.
fillRequestExtraInfo(params: Protocol.Network.RequestWillBeSentExtraInfoEvent) {
this.requestHeaders = params.headers;
}