mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Ensure partial responses are not written (#721)
various fixes for streaming, especially related to range requests - follow up to #709 - fix: prefer streaming current response via takeStream, not only when size is unknown - don't serialize async responses prematurely - don't serialize 206 responses if there is size mismatch
This commit is contained in:
parent
f56d6505c1
commit
0b9cd71c5a
2 changed files with 30 additions and 20 deletions
|
@ -697,19 +697,18 @@ export class Recorder {
|
|||
requestId,
|
||||
};
|
||||
|
||||
// fetching using response stream, await here and then either call fulFill, or if not started, return false
|
||||
if (contentLen < 0) {
|
||||
const fetcher = new ResponseStreamAsyncFetcher(opts);
|
||||
const res = await fetcher.load();
|
||||
switch (res) {
|
||||
case "dupe":
|
||||
this.removeReqResp(networkId);
|
||||
return false;
|
||||
// fetching using response stream as first attempt,
|
||||
// await here and then either call fulFill, or if dupe, return false
|
||||
const fetcher = new ResponseStreamAsyncFetcher(opts);
|
||||
const res = await fetcher.load();
|
||||
switch (res) {
|
||||
case "dupe":
|
||||
this.removeReqResp(networkId);
|
||||
return false;
|
||||
|
||||
case "fetched":
|
||||
streamingConsume = true;
|
||||
break;
|
||||
}
|
||||
case "fetched":
|
||||
streamingConsume = true;
|
||||
break;
|
||||
}
|
||||
|
||||
// if not consumed via takeStream, attempt async loading
|
||||
|
@ -750,7 +749,12 @@ export class Recorder {
|
|||
|
||||
// if in browser context, and not also intercepted in page context
|
||||
// serialize here, as won't be getting a loadingFinished message for it
|
||||
if (isBrowserContext && !reqresp.inPageContext && reqresp.payload) {
|
||||
if (
|
||||
isBrowserContext &&
|
||||
!reqresp.inPageContext &&
|
||||
!reqresp.asyncLoading &&
|
||||
reqresp.payload
|
||||
) {
|
||||
this.removeReqResp(networkId);
|
||||
await this.serializeToWARC(reqresp);
|
||||
}
|
||||
|
@ -788,7 +792,7 @@ export class Recorder {
|
|||
? "document not loaded in browser, possibly other URLs missing"
|
||||
: "URL not loaded in browser";
|
||||
|
||||
logger.debug(msg, { url, resourceType }, "recorder");
|
||||
logger.debug(msg, { url, resourceType, e }, "recorder");
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -797,7 +801,11 @@ export class Recorder {
|
|||
addAsyncFetch(opts: NetworkLoadAsyncFetchOptions, contentLen: number) {
|
||||
let fetcher: AsyncFetcher;
|
||||
|
||||
if (opts.reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) {
|
||||
if (
|
||||
opts.reqresp.method !== "GET" ||
|
||||
contentLen > MAX_NETWORK_LOAD_SIZE ||
|
||||
!opts.reqresp.inPageContext
|
||||
) {
|
||||
fetcher = new AsyncFetcher(opts);
|
||||
} else {
|
||||
fetcher = new NetworkLoadStreamAsyncFetcher(opts);
|
||||
|
@ -866,7 +874,7 @@ export class Recorder {
|
|||
|
||||
async awaitPageResources() {
|
||||
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
|
||||
if (reqresp.payload) {
|
||||
if (reqresp.payload && !reqresp.asyncLoading) {
|
||||
this.removeReqResp(requestId);
|
||||
await this.serializeToWARC(reqresp);
|
||||
// if no url, and not fetch intercept or async loading,
|
||||
|
@ -1455,8 +1463,10 @@ class AsyncFetcher {
|
|||
},
|
||||
"recorder",
|
||||
);
|
||||
//await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
|
||||
//return fetched;
|
||||
if (status === 206) {
|
||||
await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
|
||||
return "notfetched";
|
||||
}
|
||||
}
|
||||
|
||||
const externalBuffer: TempFileBuffer =
|
||||
|
|
|
@ -438,9 +438,9 @@ export async function runWorkers(
|
|||
|
||||
await Promise.allSettled(workers.map((worker) => worker.run()));
|
||||
|
||||
await crawler.browser.close();
|
||||
|
||||
await closeWorkers();
|
||||
|
||||
await crawler.browser.close();
|
||||
}
|
||||
|
||||
// ===========================================================================
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue