Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
async fetch: allow retrying async fetch if interrupted (#863)
- retry if 'truncated' set, or if size mismatch, or other exception occurs
- retry only for network load and async fetch, not for response fetch
- set max retries to 2 (same as default for pages currently)
- fixes #831
Parent: c84f58f539
Commit: 6244515818
1 changed file with 154 additions and 106 deletions
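The shape of the change: the whole payload fetch is wrapped in a bounded retry loop that resets state, purges any partially written buffer, and tries again on truncation, size mismatch, or error. A minimal sketch of that pattern, with hypothetical `doFetch` and `purgeBuffer` helpers standing in for the real `_doFetch()` and `serializer.externalBuffer?.purge()` calls:

```ts
// Minimal sketch of the bounded retry pattern this commit applies.
// `doFetch` and `purgeBuffer` are hypothetical stand-ins, not the
// crawler's actual API.
const DEFAULT_MAX_RETRIES = 2;

async function fetchWithRetries(
  doFetch: () => Promise<{ truncated: boolean }>,
  purgeBuffer: () => void,
  maxRetries = DEFAULT_MAX_RETRIES,
): Promise<"fetched" | "notfetched"> {
  let retries = 0;
  while (retries <= maxRetries) {
    try {
      const result = await doFetch();
      if (result.truncated && retries < maxRetries) {
        // partial data was written: discard it and try again
        purgeBuffer();
        retries++;
        continue;
      }
      // success, or truncated with retries exhausted (recorded as-is)
      return "fetched";
    } catch {
      if (retries < maxRetries) {
        purgeBuffer();
        retries++;
        continue;
      }
      return "notfetched";
    }
  }
  return "notfetched";
}
```

When retries run out on a truncated response, the diff keeps the partial payload and flags it with a `WARC-Truncated` header instead of discarding it.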
```diff
@@ -26,6 +26,7 @@ import { Crawler } from "../crawler.js";
 import { getProxyDispatcher } from "./proxy.js";
 import { ScopedSeed } from "./seeds.js";
 import EventEmitter from "events";
+import { DEFAULT_MAX_RETRIES } from "./constants.js";

 const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
 const MAX_TEXT_REWRITE_SIZE = 25_000_000;
```
```diff
@@ -1510,6 +1511,8 @@ class AsyncFetcher {

   manualRedirect = false;

+  maxRetries = DEFAULT_MAX_RETRIES;
+
   constructor({
     reqresp,
     expectedSize = -1,
```
```diff
@@ -1555,6 +1558,11 @@ class AsyncFetcher {
       }
     }

+    let retries = 0;
+
+    while (retries <= this.maxRetries) {
       try {
+        reqresp.truncated = undefined;
         const body = await this._doFetch();
         fetched = "fetched";
```
```diff
@@ -1574,6 +1582,18 @@ class AsyncFetcher {
         reqresp.readSize = readSize;
         // set truncated field and recompute header buff
         if (reqresp.truncated) {
+          const retry = retries < this.maxRetries;
+          logger.warn(
+            "Response truncated",
+            { url, retry, ...logDetails },
+            "recorder",
+          );
+          // if retries available, just retry
+          if (retry) {
+            void serializer.externalBuffer?.purge();
+            retries++;
+            continue;
+          }
           responseRecord.warcHeaders.headers.set(
             "WARC-Truncated",
             reqresp.truncated,
```
```diff
@@ -1584,11 +1604,17 @@ class AsyncFetcher {
           );
         }
       } catch (e) {
+        const retry = retries < this.maxRetries;
         logger.error(
           "Error reading + digesting payload",
-          { url, ...formatErr(e), ...logDetails },
+          { url, retry, ...formatErr(e), ...logDetails },
           "recorder",
         );
+        if (retry) {
+          void serializer.externalBuffer?.purge();
+          retries++;
+          continue;
+        }
       }

       if (
```
```diff
@@ -1614,6 +1640,9 @@ class AsyncFetcher {
             size: reqresp.readSize,
             expected: reqresp.expectedSize,
             url,
+            retry:
+              retries < this.maxRetries &&
+              (status === 206 || status === 200),
             ...logDetails,
           },
           "recorder",
```
```diff
@@ -1621,6 +1650,10 @@ class AsyncFetcher {
         if (status === 206 || status === 200) {
           void serializer.externalBuffer?.purge();
           await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
+          if (retries < this.maxRetries) {
+            retries++;
+            continue;
+          }
           return "notfetched";
         }
       }
```
```diff
@@ -1638,7 +1671,11 @@ class AsyncFetcher {
       } else if (fh) {
         logger.debug(
           "Large payload written to WARC, but not returned to browser (would require rereading into memory)",
-          { url, actualSize: reqresp.readSize, maxSize: this.maxFetchSize },
+          {
+            url,
+            actualSize: reqresp.readSize,
+            maxSize: this.maxFetchSize,
+          },
           "recorder",
         );
       }
```
```diff
@@ -1663,14 +1700,23 @@ class AsyncFetcher {
         if (e.message === "response-filtered-out") {
           throw e;
         }
+        const retry = retries < this.maxRetries;
         logger.debug(
           "Streaming Fetch Error",
-          { url, networkId, ...formatErr(e), ...logDetails },
+          { url, networkId, retry, ...formatErr(e), ...logDetails },
           "recorder",
         );
+        if (retry) {
+          retries++;
+          continue;
+        }
+        // indicate response is ultimately not valid
         reqresp.status = 0;
         reqresp.errorText = e.message;
       }
+      // if we get here, successful (or out of retries), break out of loop
+      break;
+    }
     } finally {
       recorder.addPageRecord(reqresp);
       // exclude direct fetch request with fake id
```
```diff
@@ -1811,6 +1857,8 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher {
     super(opts);
     this.cdp = opts.cdp;
     this.requestId = opts.requestId;
+    // can't retry this type of fetch
+    this.maxRetries = 0;
   }

   async _doFetch() {
```
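`ResponseStreamAsyncFetcher` reads the body from a one-shot CDP response stream, so a failed read cannot be replayed; zeroing `maxRetries` makes the shared retry loop give up after the first attempt. A minimal sketch of that override pattern, with hypothetical class names (the retry policy living on the base class mirrors the diff, but this is not the crawler's actual API):

```ts
// Minimal sketch: subclasses opt out of retries by zeroing maxRetries.
// Class and method names are hypothetical.
class RetryingFetcher {
  maxRetries = 2;

  async load(): Promise<"fetched" | "notfetched"> {
    for (let retries = 0; retries <= this.maxRetries; retries++) {
      try {
        await this.doFetch();
        return "fetched";
      } catch {
        // fall through and retry, if any attempts remain
      }
    }
    return "notfetched";
  }

  protected async doFetch(): Promise<void> {
    // real implementation streams and digests the response body
  }
}

class OneShotStreamFetcher extends RetryingFetcher {
  constructor() {
    super();
    // the underlying stream can only be consumed once, so never retry
    this.maxRetries = 0;
  }
}
```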