From 88a2fbd0a0ed7c0bb412adf32fc3e35d6a2ccf98 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 17 Jul 2024 13:24:25 -0700 Subject: [PATCH] Fix 206 response + general video handling (#646) Refactors handling of 206 responses: - If a 206 response is encountered, and it's actually the full range, convert to 200 and rewrite range and content-range headers to x-orig-range and x-orig-content-range. This is to support rewriting of 206 responses for DASH manifests - If a partial 206 response starts with `0-`, do a full async fetch separately. - If a partial 206 response does not start with `0-`, just ignore it (very likely a duplicate picked up when handling the 0- response) - Don't stream content-types that can be rewritten, since streaming prevents rewriting. Fixes rewriting on DASH/HLS manifests which have no content-length and don't get properly rewritten. - Overall, adds missing rewriting of DASH/HLS manifests that have no content-length and are served as 206. - Update to latest wabac.js which fixes rewriting of DASH manifest to avoid duplicate ' { } else if (typeof e === "object") { return e || {}; } else { - return { message: (e as object).toString() }; + return { message: (e as object) + "" }; } } diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 3e1f3ea1..1f14f7cf 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -45,6 +45,17 @@ const WRITE_DUPE_KEY = "s:writedupe"; const MIME_EVENT_STREAM = "text/event-stream"; +const RW_MIME_TYPES = [ + "application/x-mpegURL", + "application/vnd.apple.mpegurl", + "application/dash+xml", + "text/html", + "application/json", + "text/javascript", + "application/javascript", + "application/x-javascript", +]; + const encoder = new TextEncoder(); // ================================================================= @@ -76,7 +87,6 @@ export type PageInfoRecord = { // ================================================================= export type AsyncFetchOptions = { - tempdir: string; reqresp: RequestResponseInfo; 
expectedSize?: number; // eslint-disable-next-line no-use-before-define @@ -135,8 +145,6 @@ export class Recorder { logDetails: Record = {}; skipping = false; - allowFull206 = false; - tempdir: string; gzip = true; @@ -439,7 +447,6 @@ export class Recorder { "recorder", ); const fetcher = new AsyncFetcher({ - tempdir: this.tempdir, reqresp, recorder: this, networkId: requestId, @@ -572,15 +579,40 @@ export class Recorder { if (responseStatusCode === 206) { const range = this._getContentRange(responseHeaders); - if ( - this.allowFull206 && - range === `bytes 0-${contentLen - 1}/${contentLen}` - ) { + if (range === `bytes 0-${contentLen - 1}/${contentLen}`) { logger.debug( "Keep 206 Response, Full Range", { range, contentLen, url, networkId, ...this.logDetails }, "recorder", ); + } else if (range?.startsWith("bytes 0-")) { + logger.debug( + "Re-request 206 Response without range", + { range, contentLen, url, ...this.logDetails }, + "recorder", + ); + this.removeReqResp(networkId); + + const reqresp = new RequestResponseInfo("0"); + reqresp.fillRequest(params.request, params.resourceType); + if (reqresp.requestHeaders) { + delete reqresp.requestHeaders["range"]; + delete reqresp.requestHeaders["Range"]; + } + reqresp.frameId = params.frameId; + + this.addAsyncFetch( + { + reqresp, + expectedSize: parseInt(range.split("/")[1]), + recorder: this, + networkId: "0", + cdp, + }, + contentLen, + ); + + return false; } else { logger.debug( "Skip 206 Response", @@ -624,16 +656,17 @@ export class Recorder { return false; } + const mimeType = this.getMimeType(responseHeaders) || ""; + let streamingConsume = false; // if contentLength is large or unknown, do streaming, unless its an essential resource // in which case, need to do a full fetch either way if ( (contentLen < 0 || contentLen > MAX_BROWSER_DEFAULT_FETCH_SIZE) && - !this.isEssentialResource(reqresp.resourceType) + !this.isEssentialResource(reqresp.resourceType, mimeType) ) { const opts: ResponseStreamAsyncFetchOptions 
= { - tempdir: this.tempdir, reqresp, expectedSize: contentLen, recorder: this, @@ -659,14 +692,7 @@ export class Recorder { // if not consumed via takeStream, attempt async loading if (!streamingConsume) { - let fetcher: AsyncFetcher; - - if (reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) { - fetcher = new AsyncFetcher(opts); - } else { - fetcher = new NetworkLoadStreamAsyncFetcher(opts); - } - this.fetcherQ.add(() => fetcher.load()); + this.addAsyncFetch(opts, contentLen); return false; } } else { @@ -698,7 +724,7 @@ export class Recorder { } } - const rewritten = await this.rewriteResponse(reqresp, responseHeaders); + const rewritten = await this.rewriteResponse(reqresp, mimeType); // if in service worker, serialize here // as won't be getting a loadingFinished message @@ -746,6 +772,17 @@ export class Recorder { return true; } + addAsyncFetch(opts: NetworkLoadAsyncFetchOptions, contentLen: number) { + let fetcher: AsyncFetcher; + + if (opts.reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) { + fetcher = new AsyncFetcher(opts); + } else { + fetcher = new NetworkLoadStreamAsyncFetcher(opts); + } + this.fetcherQ.add(() => fetcher.load()); + } + startPage({ pageid, url }: { pageid: string; url: string }) { this.pageid = pageid; this.pageUrl = url; @@ -927,10 +964,7 @@ export class Recorder { return false; } - async rewriteResponse( - reqresp: RequestResponseInfo, - responseHeaders?: Protocol.Fetch.HeaderEntry[], - ) { + async rewriteResponse(reqresp: RequestResponseInfo, contentType: string) { const { url, extraOpts, payload } = reqresp; // don't rewrite if payload is missing or too big @@ -941,8 +975,6 @@ export class Recorder { let newString = null; let string = null; - const contentType = this._getContentType(responseHeaders); - switch (contentType) { case "application/x-mpegURL": case "application/vnd.apple.mpegurl": @@ -983,17 +1015,26 @@ export class Recorder { "recorder", ); reqresp.payload = encoder.encode(newString); + 
reqresp.isRemoveRange = true; return true; } else { return false; } } - isEssentialResource(resourceType: string | undefined) { - return ["document", "stylesheet", "script"].includes(resourceType || ""); + isEssentialResource(resourceType: string | undefined, contentType: string) { + if (["document", "stylesheet", "script"].includes(resourceType || "")) { + return true; + } + + if (RW_MIME_TYPES.includes(contentType)) { + return true; + } + + return false; } - _getContentType( + protected getMimeType( headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[], ) { if (!headers) { @@ -1008,7 +1049,7 @@ export class Recorder { return null; } - _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) { + protected _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) { if (!headers) { return -1; } @@ -1120,7 +1161,7 @@ export class Recorder { !isRedirectStatus(status) && !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status)) ) { - logNetwork("Skipping dupe", { url }); + logNetwork("Skipping dupe", { url, status, ...this.logDetails }); return; } @@ -1173,7 +1214,6 @@ export class Recorder { // ignore dupes: if previous URL was not a page, still load as page. 
if previous was page, // should not get here, as dupe pages tracked via seen list const fetcher = new AsyncFetcher({ - tempdir: this.tempdir, reqresp, recorder: this, networkId: "0", @@ -1231,7 +1271,6 @@ class AsyncFetcher { manualRedirect = false; constructor({ - tempdir, reqresp, expectedSize = -1, recorder, @@ -1251,7 +1290,7 @@ class AsyncFetcher { this.recorder = recorder; - this.tempdir = tempdir; + this.tempdir = recorder.tempdir; this.filename = path.join( this.tempdir, `${timestampNow()}-${uuidv4()}.data`, @@ -1604,7 +1643,7 @@ class NetworkLoadStreamAsyncFetcher extends AsyncFetcher { return; } - reqresp.status = httpStatusCode || 0; + reqresp.setStatus(httpStatusCode || 200); reqresp.responseHeaders = headers || {}; return this.takeStreamIter(cdp, stream); @@ -1618,6 +1657,10 @@ function createResponse( pageid: string, contentIter?: AsyncIterable | Iterable, ) { + if (reqresp.isRemoveRange && reqresp.status === 206) { + reqresp.setStatus(200); + } + const url = reqresp.url; const warcVersion = "WARC/1.1"; const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`; diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 42c692ee..5132168e 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -7,6 +7,8 @@ import { HTML_TYPES } from "./constants.js"; import { Response } from "undici"; const CONTENT_LENGTH = "content-length"; +const CONTENT_RANGE = "content-range"; +const RANGE = "range"; const CONTENT_TYPE = "content-type"; const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"]; @@ -46,6 +48,7 @@ export class RequestResponseInfo { responseHeadersText?: string; payload?: Uint8Array; + isRemoveRange = false; // misc fromServiceWorker = false; @@ -76,11 +79,17 @@ export class RequestResponseInfo { this.requestId = requestId; } + setStatus(status: number, statusText?: string) { + this.status = status; + this.statusText = statusText || getStatusText(this.status); + } + fillFetchRequestPaused(params: 
Protocol.Fetch.RequestPausedEvent) { this.fillRequest(params.request, params.resourceType); - this.status = params.responseStatusCode || 0; - this.statusText = params.responseStatusText || getStatusText(this.status); + if (params.responseStatusCode) { + this.setStatus(params.responseStatusCode, params.responseStatusText); + } this.responseHeadersList = params.responseHeaders; @@ -116,8 +125,7 @@ export class RequestResponseInfo { this.url = response.url.split("#")[0]; - this.status = response.status; - this.statusText = response.statusText || getStatusText(this.status); + this.setStatus(response.status, response.statusText); this.protocol = response.protocol; @@ -182,8 +190,7 @@ export class RequestResponseInfo { fillFetchResponse(response: Response) { this.responseHeaders = Object.fromEntries(response.headers); - this.status = response.status; - this.statusText = response.statusText || getStatusText(this.status); + this.setStatus(response.status, response.statusText); } fillRequestExtraInfo( @@ -240,7 +247,11 @@ export class RequestResponseInfo { headersDict[headerName] = "" + actualContentLength; continue; } - if (EXCLUDE_HEADERS.includes(headerName)) { + if ( + EXCLUDE_HEADERS.includes(headerName) || + (this.isRemoveRange && + (headerName === CONTENT_RANGE || headerName === RANGE)) + ) { headerName = "x-orig-" + headerName; } headersDict[headerName] = this._encodeHeaderValue(header.value); @@ -263,7 +274,11 @@ export class RequestResponseInfo { } const value = this._encodeHeaderValue(headersDict[key]); - if (EXCLUDE_HEADERS.includes(keyLower)) { + if ( + EXCLUDE_HEADERS.includes(keyLower) || + (this.isRemoveRange && + (keyLower === CONTENT_RANGE || keyLower === RANGE)) + ) { headersDict["x-orig-" + key] = value; delete headersDict[key]; } else { @@ -316,11 +331,11 @@ export class RequestResponseInfo { } shouldSkipSave() { - // skip cached, OPTIONS/HEAD responses, and 304 or 206 responses + // skip cached, OPTIONS/HEAD responses, and 304 responses if ( 
this.fromCache || (this.method && ["OPTIONS", "HEAD"].includes(this.method)) || - [206, 304].includes(this.status) + this.status == 304 ) { return true; } @@ -330,6 +345,17 @@ export class RequestResponseInfo { return true; } + if (this.status === 206) { + const headers = new Headers(this.getResponseHeadersDict()); + const contentLength: number = parseInt( + headers.get(CONTENT_LENGTH) || "0", + ); + const contentRange = headers.get(CONTENT_RANGE); + if (contentRange !== `bytes 0-${contentLength - 1}/${contentLength}`) { + return false; + } + } + return false; } diff --git a/yarn.lock b/yarn.lock index 86f285c0..740ef0a1 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1300,21 +1300,21 @@ resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406" integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ== -"@webrecorder/wabac@^2.19.1": - version "2.19.1" - resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.1.tgz#ce0d609f9e90c708af99945e1fa338be0ba2b5f9" - integrity sha512-m8Fi70OkhzkicbcbN5TrrBpj5D/EZKzVp5905kGPoC2F2zLqxUDMzx1FOHt2sTO/1b9NMvBmw9Pk1JQyYEm6rA== +"@webrecorder/wabac@^2.19.4": + version "2.19.4" + resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.4.tgz#6c91a65928413b8394f17b57f57a803dcb111dbe" + integrity sha512-USWUoreSfgyeYYrC2/o2YYr4dCUSwgOSzbpdapqh90VQ4Fb0fjwPAiessBCH4rA5yd9QpOgWdkapDmXvLx6Bww== dependencies: "@peculiar/asn1-ecc" "^2.3.4" "@peculiar/asn1-schema" "^2.3.3" "@peculiar/x509" "^1.9.2" - "@webrecorder/wombat" "^3.7.8" + "@webrecorder/wombat" "^3.7.11" acorn "^8.10.0" auto-js-ipfs "^2.1.1" base64-js "^1.5.1" brotli "^1.3.3" buffer "^6.0.3" - fast-xml-parser "^4.2.5" + fast-xml-parser "^4.4.0" hash-wasm "^4.9.0" http-link-header "^1.1.3" http-status-codes "^2.1.4" @@ -1329,10 +1329,10 @@ stream-browserify "^3.0.0" warcio "^2.2.1" -"@webrecorder/wombat@^3.7.8": - version "3.7.8" - 
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.8.tgz#a414278b6fbd99bc02a97e384f0373307e60d9fa" - integrity sha512-BmEHrvGLHPQtECmCK9Oz7G3p2StsyaFOlNmAMDSNK/GjqPH+UWZOqDryBkWryTh+pFZXKblqyotLtvR4YxVyeQ== +"@webrecorder/wombat@^3.7.11": + version "3.7.11" + resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.11.tgz#27539f52317b2d80af4f28d971d59b53bc0f2b96" + integrity sha512-WlGpKjHUpP2aZo/OrY5aduNX/TVdo+hSkzu9as/63wSQ4ZFWIqZ+pxYXci43hjV5oVjcMP4KALLq+V+Fuo8qSA== dependencies: warcio "^2.2.0" @@ -2431,7 +2431,7 @@ fast-xml-parser@^4.2.2: dependencies: strnum "^1.0.5" -fast-xml-parser@^4.2.5: +fast-xml-parser@^4.4.0: version "4.4.0" resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.0.tgz#341cc98de71e9ba9e651a67f41f1752d1441a501" integrity sha512-kLY3jFlwIYwBNDojclKsNAC12sfD6NwW74QB2CoNGPvtVxjliYehVunB3HYyNi+n4Tt1dAcgwYvmKF/Z18flqg==