mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
recorder: don't do a streaming fetch for unknown-size or large responses if the content-type is
text/html, as HTML always needs to be loaded in the browser to continue. type fixes: remove a few 'any' types in reqresp
This commit is contained in:
parent
5a6bef890f
commit
e534f49e5e
2 changed files with 22 additions and 34 deletions
|
@ -20,6 +20,7 @@ import { TempFileBuffer, WARCSerializer } from "warcio/node";
|
|||
import { WARCWriter } from "./warcwriter.js";
|
||||
import { RedisCrawlState, WorkerId } from "./state.js";
|
||||
import { CDPSession, Protocol } from "puppeteer-core";
|
||||
import { Crawler } from "../crawler.js";
|
||||
|
||||
const MAX_BROWSER_FETCH_SIZE = 2_000_000;
|
||||
const MAX_NETWORK_LOAD_SIZE = 200_000_000;
|
||||
|
@ -43,9 +44,8 @@ export class Recorder
|
|||
{
|
||||
workerid: WorkerId;
|
||||
collDir: string;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
crawler: any;
|
||||
|
||||
crawler: Crawler;
|
||||
|
||||
crawlState: RedisCrawlState;
|
||||
|
||||
|
@ -74,13 +74,14 @@ export class Recorder
|
|||
|
||||
writer: WARCWriter;
|
||||
|
||||
pageUrl!: string;
|
||||
pageid!: string;
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
||||
constructor(
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
{workerid, collDir, crawler} : {workerid: WorkerId, collDir: string, crawler: any}
|
||||
{workerid, collDir, crawler} : {workerid: WorkerId, collDir: string, crawler: Crawler}
|
||||
) {
|
||||
this.workerid = workerid;
|
||||
this.crawler = crawler;
|
||||
|
@ -354,7 +355,12 @@ export class Recorder
|
|||
|
||||
let streamingConsume = false;
|
||||
|
||||
if (contentLen < 0 || contentLen > MAX_BROWSER_FETCH_SIZE) {
|
||||
const contentType = this._getContentType(responseHeaders);
|
||||
|
||||
// stream async response if size is unknown or greater than browser fetch size,
|
||||
// may potentially not serve in the browser, depending on size.
|
||||
// except for HTML pages, since need to always load response in browser
|
||||
if (contentType !== "text/html" && (contentLen < 0 || contentLen > MAX_BROWSER_FETCH_SIZE)) {
|
||||
const opts = {tempdir: this.tempdir, reqresp, expectedSize: contentLen, recorder: this, networkId, cdp};
|
||||
|
||||
// fetching using response stream, await here and then either call fulFill, or if not started, return false
|
||||
|
@ -397,7 +403,7 @@ export class Recorder
|
|||
}
|
||||
}
|
||||
|
||||
const rewritten = await this.rewriteResponse(reqresp);
|
||||
const rewritten = await this.rewriteResponse(reqresp, contentType);
|
||||
|
||||
// if in service worker, serialize here
|
||||
// as won't be getting a loadingFinished message
|
||||
|
@ -439,6 +445,7 @@ export class Recorder
|
|||
|
||||
startPage({pageid, url} : {pageid: string, url: string}) {
|
||||
this.pageid = pageid;
|
||||
this.pageUrl = url;
|
||||
this.logDetails = {page: url, workerid: this.workerid};
|
||||
if (this.pendingRequests && this.pendingRequests.size) {
|
||||
logger.debug("Interrupting timed out requests, moving to next page", this.logDetails, "recorder");
|
||||
|
@ -527,8 +534,8 @@ export class Recorder
|
|||
return false;
|
||||
}
|
||||
|
||||
async rewriteResponse(reqresp: RequestResponseInfo) {
|
||||
const { url, responseHeadersList, extraOpts, payload } = reqresp;
|
||||
async rewriteResponse(reqresp: RequestResponseInfo, contentType: string | null) {
|
||||
const { url, extraOpts, payload } = reqresp;
|
||||
|
||||
if (!payload || !payload.length) {
|
||||
return false;
|
||||
|
@ -537,9 +544,7 @@ export class Recorder
|
|||
let newString = null;
|
||||
let string = null;
|
||||
|
||||
const ct = this._getContentType(responseHeadersList);
|
||||
|
||||
switch (ct) {
|
||||
switch (contentType) {
|
||||
case "application/x-mpegURL":
|
||||
case "application/vnd.apple.mpegurl":
|
||||
string = payload.toString();
|
||||
|
|
|
@ -65,29 +65,16 @@ export class RequestResponseInfo
|
|||
this.requestId = requestId;
|
||||
}
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
fillRequest(params: Record<string, any>) {
|
||||
fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) {
|
||||
this.url = params.request.url;
|
||||
this.method = params.request.method;
|
||||
if (!this.requestHeaders) {
|
||||
this.requestHeaders = params.request.headers;
|
||||
}
|
||||
this.postData = params.request.postData;
|
||||
this.hasPostData = params.request.hasPostData;
|
||||
this.hasPostData = params.request.hasPostData || false;
|
||||
|
||||
if (params.type) {
|
||||
this.resourceType = params.type;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
fillFetchRequestPaused(params: Record<string, any>) {
|
||||
this.fillRequest(params);
|
||||
|
||||
this.status = params.responseStatusCode;
|
||||
this.status = params.responseStatusCode || 0;
|
||||
this.statusText = params.responseStatusText || getStatusText(this.status);
|
||||
|
||||
this.responseHeadersList = params.responseHeaders;
|
||||
|
@ -147,7 +134,7 @@ export class RequestResponseInfo
|
|||
}
|
||||
}
|
||||
|
||||
fillResponseReceivedExtraInfo(params: Record<string, string>) {
|
||||
fillResponseReceivedExtraInfo(params: Protocol.Network.ResponseReceivedExtraInfoEvent) {
|
||||
// this.responseHeaders = params.headers;
|
||||
// if (params.headersText) {
|
||||
// this.responseHeadersText = params.headersText;
|
||||
|
@ -155,18 +142,14 @@ export class RequestResponseInfo
|
|||
this.extraOpts.ipType = params.resourceIPAddressSpace;
|
||||
}
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
fillFetchResponse(response: Record<string, any>) {
|
||||
fillFetchResponse(response: Response) {
|
||||
this.responseHeaders = Object.fromEntries(response.headers);
|
||||
this.status = response.status;
|
||||
this.statusText = response.statusText || getStatusText(this.status);
|
||||
|
||||
}
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
fillRequestExtraInfo(params: Record<string, any>) {
|
||||
fillRequestExtraInfo(params: Protocol.Network.RequestWillBeSentExtraInfoEvent) {
|
||||
this.requestHeaders = params.headers;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue