async fetch: allow retrying async fetch if interrupted (#863)

- retry if 'truncated' is set, if there is a size mismatch, or if another exception occurs
- retry only for network load and async fetch, not for response fetch
- set max retries to 2 (same as default for pages currently)
- fixes #831
Ilya Kreymer, 2025-07-08 10:02:09 -07:00, committed by GitHub
parent c84f58f539
commit 6244515818

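In essence, the change wraps the async fetch in a retry loop: on a truncated or size-mismatched response, or on an exception, any partially buffered data is purged and the fetch is reattempted. Below is a minimal sketch of the pattern with hypothetical, simplified names (the real AsyncFetcher in the diff also handles WARC serialization, dedup keys, and purging buffered data before each retry):

type FetchResult = { truncated?: string; readSize: number };

const DEFAULT_MAX_RETRIES = 2; // same as the current default for pages

async function fetchWithRetries(
  doFetch: () => Promise<FetchResult>,
  expectedSize = -1,
  maxRetries = DEFAULT_MAX_RETRIES,
): Promise<"fetched" | "notfetched"> {
  let retries = 0;
  while (retries <= maxRetries) {
    try {
      const res = await doFetch();
      // success only if not truncated and the size matches (when known)
      if (
        !res.truncated &&
        (expectedSize < 0 || res.readSize === expectedSize)
      ) {
        return "fetched";
      }
    } catch {
      // any other exception also triggers a retry
    }
    if (retries < maxRetries) {
      retries++;
      continue;
    }
    return "notfetched";
  }
  return "notfetched";
}

Note that ResponseStreamAsyncFetcher opts out by setting maxRetries = 0: per the commit message, only network loads and async fetches are retried, not response fetches, whose body is streamed over CDP and can only be read once.
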
@@ -26,6 +26,7 @@ import { Crawler } from "../crawler.js";
 import { getProxyDispatcher } from "./proxy.js";
 import { ScopedSeed } from "./seeds.js";
 import EventEmitter from "events";
+import { DEFAULT_MAX_RETRIES } from "./constants.js";
 
 const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
 const MAX_TEXT_REWRITE_SIZE = 25_000_000;
@@ -1510,6 +1511,8 @@ class AsyncFetcher {
   manualRedirect = false;
 
+  maxRetries = DEFAULT_MAX_RETRIES;
+
   constructor({
     reqresp,
     expectedSize = -1,
@@ -1555,6 +1558,11 @@ class AsyncFetcher {
         }
       }
 
+      let retries = 0;
+
+      while (retries <= this.maxRetries) {
         try {
+          reqresp.truncated = undefined;
+
           const body = await this._doFetch();
           fetched = "fetched";
@@ -1574,6 +1582,18 @@ class AsyncFetcher {
             reqresp.readSize = readSize;
             // set truncated field and recompute header buff
             if (reqresp.truncated) {
+              const retry = retries < this.maxRetries;
+              logger.warn(
+                "Response truncated",
+                { url, retry, ...logDetails },
+                "recorder",
+              );
+              // if retries available, just retry
+              if (retry) {
+                void serializer.externalBuffer?.purge();
+                retries++;
+                continue;
+              }
               responseRecord.warcHeaders.headers.set(
                 "WARC-Truncated",
                 reqresp.truncated,
@@ -1584,11 +1604,17 @@ class AsyncFetcher {
               );
             }
           } catch (e) {
+            const retry = retries < this.maxRetries;
             logger.error(
               "Error reading + digesting payload",
-              { url, ...formatErr(e), ...logDetails },
+              { url, retry, ...formatErr(e), ...logDetails },
               "recorder",
             );
+            if (retry) {
+              void serializer.externalBuffer?.purge();
+              retries++;
+              continue;
+            }
           }
 
           if (
@@ -1614,6 +1640,9 @@ class AsyncFetcher {
                 size: reqresp.readSize,
                 expected: reqresp.expectedSize,
                 url,
+                retry:
+                  retries < this.maxRetries &&
+                  (status === 206 || status === 200),
                 ...logDetails,
               },
               "recorder",
@@ -1621,6 +1650,10 @@ class AsyncFetcher {
             if (status === 206 || status === 200) {
               void serializer.externalBuffer?.purge();
               await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
+              if (retries < this.maxRetries) {
+                retries++;
+                continue;
+              }
               return "notfetched";
             }
           }
@@ -1638,7 +1671,11 @@ class AsyncFetcher {
           } else if (fh) {
             logger.debug(
               "Large payload written to WARC, but not returned to browser (would require rereading into memory)",
-              { url, actualSize: reqresp.readSize, maxSize: this.maxFetchSize },
+              {
+                url,
+                actualSize: reqresp.readSize,
+                maxSize: this.maxFetchSize,
+              },
               "recorder",
             );
           }
@@ -1663,14 +1700,23 @@ class AsyncFetcher {
           if (e.message === "response-filtered-out") {
             throw e;
           }
+          const retry = retries < this.maxRetries;
           logger.debug(
             "Streaming Fetch Error",
-            { url, networkId, ...formatErr(e), ...logDetails },
+            { url, networkId, retry, ...formatErr(e), ...logDetails },
             "recorder",
           );
+          if (retry) {
+            retries++;
+            continue;
+          }
+          // indicate response is ultimately not valid
           reqresp.status = 0;
           reqresp.errorText = e.message;
         }
+        // if we get here, successful (or out of retries), break out of loop
+        break;
+      }
     } finally {
       recorder.addPageRecord(reqresp);
       // exclude direct fetch request with fake id
@@ -1811,6 +1857,8 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher {
     super(opts);
     this.cdp = opts.cdp;
     this.requestId = opts.requestId;
+    // can't retry this type of fetch
+    this.maxRetries = 0;
   }
 
   async _doFetch() {