From e534f49e5ef1c5159c2d5f90a5190b0f02cc9cc9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 9 Nov 2023 08:55:42 -0800 Subject: [PATCH] recorder: don't do streaming fetch for unknown or large responses if content-type is text/html, always need to load in browser to continue type fixes: remove a few 'any' types in reqresp --- src/util/recorder.ts | 27 ++++++++++++++++----------- src/util/reqresp.ts | 29 ++++++----------------------- 2 files changed, 22 insertions(+), 34 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 88d7cffd..f05a717a 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -20,6 +20,7 @@ import { TempFileBuffer, WARCSerializer } from "warcio/node"; import { WARCWriter } from "./warcwriter.js"; import { RedisCrawlState, WorkerId } from "./state.js"; import { CDPSession, Protocol } from "puppeteer-core"; +import { Crawler } from "../crawler.js"; const MAX_BROWSER_FETCH_SIZE = 2_000_000; const MAX_NETWORK_LOAD_SIZE = 200_000_000; @@ -43,9 +44,8 @@ export class Recorder { workerid: WorkerId; collDir: string; - // TODO: Fix this the next time the file is edited. - // eslint-disable-next-line @typescript-eslint/no-explicit-any - crawler: any; + + crawler: Crawler; crawlState: RedisCrawlState; @@ -74,13 +74,14 @@ export class Recorder writer: WARCWriter; + pageUrl!: string; pageid!: string; // TODO: Fix this the next time the file is edited. 
constructor( // eslint-disable-next-line @typescript-eslint/no-explicit-any - {workerid, collDir, crawler} : {workerid: WorkerId, collDir: string, crawler: any} + {workerid, collDir, crawler} : {workerid: WorkerId, collDir: string, crawler: Crawler} ) { this.workerid = workerid; this.crawler = crawler; @@ -354,7 +355,12 @@ export class Recorder let streamingConsume = false; - if (contentLen < 0 || contentLen > MAX_BROWSER_FETCH_SIZE) { + const contentType = this._getContentType(responseHeaders); + + // stream async response if size is unknown or greater than browser fetch size, + // may potentially not serve in the browser, depending on size. + // except for HTML pages, since need to always load response in browser + if (contentType !== "text/html" && (contentLen < 0 || contentLen > MAX_BROWSER_FETCH_SIZE)) { const opts = {tempdir: this.tempdir, reqresp, expectedSize: contentLen, recorder: this, networkId, cdp}; // fetching using response stream, await here and then either call fulFill, or if not started, return false @@ -397,7 +403,7 @@ export class Recorder } } - const rewritten = await this.rewriteResponse(reqresp); + const rewritten = await this.rewriteResponse(reqresp, contentType); // if in service worker, serialize here // as won't be getting a loadingFinished message @@ -439,6 +445,7 @@ export class Recorder startPage({pageid, url} : {pageid: string, url: string}) { this.pageid = pageid; + this.pageUrl = url; this.logDetails = {page: url, workerid: this.workerid}; if (this.pendingRequests && this.pendingRequests.size) { logger.debug("Interrupting timed out requests, moving to next page", this.logDetails, "recorder"); @@ -527,8 +534,8 @@ export class Recorder return false; } - async rewriteResponse(reqresp: RequestResponseInfo) { - const { url, responseHeadersList, extraOpts, payload } = reqresp; + async rewriteResponse(reqresp: RequestResponseInfo, contentType: string | null) { + const { url, extraOpts, payload } = reqresp; if (!payload || !payload.length) 
{ return false; @@ -537,9 +544,7 @@ export class Recorder let newString = null; let string = null; - const ct = this._getContentType(responseHeadersList); - - switch (ct) { + switch (contentType) { case "application/x-mpegURL": case "application/vnd.apple.mpegurl": string = payload.toString(); diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 9d95f94d..78c12ab0 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -65,29 +65,16 @@ export class RequestResponseInfo this.requestId = requestId; } - // TODO: Fix this the next time the file is edited. - // eslint-disable-next-line @typescript-eslint/no-explicit-any - fillRequest(params: Record) { + fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) { this.url = params.request.url; this.method = params.request.method; if (!this.requestHeaders) { this.requestHeaders = params.request.headers; } this.postData = params.request.postData; - this.hasPostData = params.request.hasPostData; + this.hasPostData = params.request.hasPostData || false; - if (params.type) { - this.resourceType = params.type; - } - - } - - // TODO: Fix this the next time the file is edited. - // eslint-disable-next-line @typescript-eslint/no-explicit-any - fillFetchRequestPaused(params: Record) { - this.fillRequest(params); - - this.status = params.responseStatusCode; + this.status = params.responseStatusCode || 0; this.statusText = params.responseStatusText || getStatusText(this.status); this.responseHeadersList = params.responseHeaders; @@ -147,7 +134,7 @@ export class RequestResponseInfo } } - fillResponseReceivedExtraInfo(params: Record) { + fillResponseReceivedExtraInfo(params: Protocol.Network.ResponseReceivedExtraInfoEvent) { // this.responseHeaders = params.headers; // if (params.headersText) { // this.responseHeadersText = params.headersText; @@ -155,18 +142,14 @@ export class RequestResponseInfo this.extraOpts.ipType = params.resourceIPAddressSpace; } - // TODO: Fix this the next time the file is edited. 
- // eslint-disable-next-line @typescript-eslint/no-explicit-any - fillFetchResponse(response: Record) { + fillFetchResponse(response: Response) { this.responseHeaders = Object.fromEntries(response.headers); this.status = response.status; this.statusText = response.statusText || getStatusText(this.status); } - // TODO: Fix this the next time the file is edited. - // eslint-disable-next-line @typescript-eslint/no-explicit-any - fillRequestExtraInfo(params: Record) { + fillRequestExtraInfo(params: Protocol.Network.RequestWillBeSentExtraInfoEvent) { this.requestHeaders = params.headers; }