Ensure partial responses are not written (#721)

various fixes for streaming, especially related to range requests
- follow up to #709
- fix: prefer streaming current response via takeStream, not only when
size is unknown
- don't serialize async responses prematurely
- don't serialize 206 responses if there is size mismatch
This commit is contained in:
Ilya Kreymer 2024-11-13 23:28:37 -08:00 committed by GitHub
parent f56d6505c1
commit 0b9cd71c5a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 30 additions and 20 deletions

View file

@ -697,8 +697,8 @@ export class Recorder {
requestId,
};
// fetching using response stream, await here and then either call fulFill, or if not started, return false
if (contentLen < 0) {
// fetching using response stream as first attempt,
// await here and then either call fulFill, or if dupe, return false
const fetcher = new ResponseStreamAsyncFetcher(opts);
const res = await fetcher.load();
switch (res) {
@ -710,7 +710,6 @@ export class Recorder {
streamingConsume = true;
break;
}
}
// if not consumed via takeStream, attempt async loading
if (!streamingConsume) {
@ -750,7 +749,12 @@ export class Recorder {
// if in browser context, and not also intercepted in page context
// serialize here, as won't be getting a loadingFinished message for it
if (isBrowserContext && !reqresp.inPageContext && reqresp.payload) {
if (
isBrowserContext &&
!reqresp.inPageContext &&
!reqresp.asyncLoading &&
reqresp.payload
) {
this.removeReqResp(networkId);
await this.serializeToWARC(reqresp);
}
@ -788,7 +792,7 @@ export class Recorder {
? "document not loaded in browser, possibly other URLs missing"
: "URL not loaded in browser";
logger.debug(msg, { url, resourceType }, "recorder");
logger.debug(msg, { url, resourceType, e }, "recorder");
}
return true;
@ -797,7 +801,11 @@ export class Recorder {
addAsyncFetch(opts: NetworkLoadAsyncFetchOptions, contentLen: number) {
let fetcher: AsyncFetcher;
if (opts.reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) {
if (
opts.reqresp.method !== "GET" ||
contentLen > MAX_NETWORK_LOAD_SIZE ||
!opts.reqresp.inPageContext
) {
fetcher = new AsyncFetcher(opts);
} else {
fetcher = new NetworkLoadStreamAsyncFetcher(opts);
@ -866,7 +874,7 @@ export class Recorder {
async awaitPageResources() {
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
if (reqresp.payload) {
if (reqresp.payload && !reqresp.asyncLoading) {
this.removeReqResp(requestId);
await this.serializeToWARC(reqresp);
// if no url, and not fetch intercept or async loading,
@ -1455,8 +1463,10 @@ class AsyncFetcher {
},
"recorder",
);
//await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
//return fetched;
if (status === 206) {
await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
return "notfetched";
}
}
const externalBuffer: TempFileBuffer =

View file

@ -438,9 +438,9 @@ export async function runWorkers(
await Promise.allSettled(workers.map((worker) => worker.run()));
await crawler.browser.close();
await closeWorkers();
await crawler.browser.close();
}
// ===========================================================================