Async Fetch Refactor (#880)

- separate out reading the stream response while the browser is waiting (not really async) from actual async loading; this is now handled via fetchResponseBody()
- unify async fetch into first trying browser networking for a regular GET, falling back to a regular fetch()
- load headers and body separately in async fetch, allowing the request to be cancelled after the headers are read (a minimal sketch of this pattern follows this list)
- refactor direct fetch of non-HTML pages: load the headers, then handle loading the body and adding the page asynchronously, allowing the worker to continue loading browser-based pages (should allow more parallelization in the future)
- unify WARC writing in preparation for dedup: a unified serializeWARC() is called for all paths, the WARC digest is computed, and additional payload checks are added for streaming loading (see the digest sketch below)
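Two of the patterns above can be illustrated with short sketches. First, loading headers and body separately: the sketch below uses the standard fetch() and AbortController APIs to show how a request can be cancelled once the headers have been inspected. This only illustrates the pattern and is not the crawler's actual code; fetchDirect(), MAX_SIZE, and the content-length check are assumptions.

// Sketch only: inspect headers first, then decide whether to stream the body.
const MAX_SIZE = 50_000_000; // illustrative limit, not a real crawler setting

async function fetchDirect(url: string, headers: Record<string, string>) {
  const abort = new AbortController();
  const resp = await fetch(url, { headers, signal: abort.signal });

  // headers are available before any body bytes are read
  const length = Number(resp.headers.get("content-length") || "-1");
  if (!resp.ok || length > MAX_SIZE) {
    // cancel here: the body is never downloaded
    abort.abort();
    return null;
  }

  // only now stream the body
  const reader = resp.body!.getReader();
  const chunks: Uint8Array[] = [];
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
  }
  return {
    mime: resp.headers.get("content-type") || "",
    status: resp.status,
    body: Buffer.concat(chunks),
  };
}

Second, a WARC digest computed during streaming: the sketch below hashes the payload as it is consumed, so a digest is available for dedup checks as soon as loading finishes. The sha-256 algorithm, the digest prefix, and the function name are assumptions, and actual WARC record serialization is omitted.

import { createHash } from "node:crypto";

// Sketch only: hash the payload while streaming it, so the digest is ready
// when a serializeWARC()-style write happens at the end.
async function digestWhileStreaming(body: ReadableStream<Uint8Array>) {
  const hash = createHash("sha256");
  let size = 0;
  const reader = body.getReader();
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    hash.update(value);
    size += value.length;
    // the same chunk could also be appended to the WARC record body here
  }
  return { payloadDigest: "sha-256:" + hash.digest("hex"), size };
}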
Ilya Kreymer 2025-09-10 12:05:21 -07:00 committed by GitHub
parent a42c0b926e
commit 705bc0cd9f
4 changed files with 524 additions and 551 deletions

@@ -1059,58 +1059,43 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;
 
+    let result = false;
+
     if (recorder) {
       try {
         const headers = auth
           ? { Authorization: auth, ...this.headers }
           : this.headers;
 
-        const result = await timedRun(
-          recorder.directFetchCapture({ url, headers, cdp }),
+        result = await timedRun(
+          recorder.directFetchCapture({
+            url,
+            headers,
+            cdp,
+            state: data,
+            crawler: this,
+          }),
           this.params.pageLoadTimeout,
           "Direct fetch of page URL timed out",
           logDetails,
           "fetch",
         );
-
-        // fetched timed out, already logged, don't retry in browser
-        if (!result) {
-          return;
-        }
-
-        const { fetched, mime, ts } = result;
-
-        if (mime) {
-          data.mime = mime;
-          data.isHTMLPage = isHTMLMime(mime);
-        }
-
-        if (fetched) {
-          data.loadState = LoadState.FULL_PAGE_LOADED;
-          data.status = 200;
-          data.ts = ts || new Date();
-          logger.info(
-            "Direct fetch successful",
-            { url, mime, ...logDetails },
-            "fetch",
-          );
-          return;
-        }
       } catch (e) {
-        if (e instanceof Error && e.message === "response-filtered-out") {
-          // filtered out direct fetch
-          logger.debug(
-            "Direct fetch response not accepted, continuing with browser fetch",
-            logDetails,
-            "fetch",
-          );
-        } else {
-          logger.error(
-            "Direct fetch of page URL failed",
-            { e, ...logDetails },
-            "fetch",
-          );
-          return;
-        }
+        logger.error(
+          "Direct fetch of page URL failed",
+          { e, ...logDetails },
+          "fetch",
+        );
+      }
+
+      if (!result) {
+        logger.debug(
+          "Direct fetch response not accepted, continuing with browser fetch",
+          logDetails,
+          "fetch",
+        );
+      } else {
+        return;
       }
     }
@@ -1280,6 +1265,10 @@ self.__bx_behaviors.selectMainBehavior();
   }
 
   async pageFinished(data: PageState, lastErrorText = "") {
+    // not yet finished
+    if (data.asyncLoading) {
+      return;
+    }
     // if page loaded, considered page finished successfully
     // (even if behaviors timed out)
     const { loadState, logDetails, depth, url, pageSkipped } = data;

File diff suppressed because it is too large

@@ -85,6 +85,7 @@ export class PageState {
   skipBehaviors = false;
   pageSkipped = false;
+  asyncLoading = false;
   filteredFrames: Frame[] = [];
 
   loadState: LoadState = LoadState.FAILED;
   contentCheckAllowed = false;

@@ -76,7 +76,7 @@ test("PDF: check that the pages.jsonl file entry contains status code and mime t
   expect(pageH.loadState).toBe(2);
 });
 
-test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinfo entries", () => {
+test("PDF: check that CDX contains data from two crawls: one pdf 200, one 301 and one 200, two pageinfo entries", () => {
   const filedata = fs.readFileSync(
     "test-crawls/collections/crawl-pdf/indexes/index.cdxj",
     { encoding: "utf-8" },
@@ -90,6 +90,7 @@ test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinf
   expect(cdxj[0].url).toBe(PDF_HTTP);
   expect(cdxj[0].status).toBe("301");
 
+  // this is duplicated as this is data from two crawls
   expect(cdxj[1].url).toBe(PDF);
   expect(cdxj[1].status).toBe("200");
   expect(cdxj[1].mime).toBe("application/pdf");
@@ -149,7 +150,7 @@ test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinf
   const lines = filedata.trim().split("\n");
   const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1);
 
-  expect(cdxj.length).toBe(6);
+  expect(cdxj.length).toBe(5);
 
   expect(cdxj[0].url).toBe("https://webrecorder.net/favicon.ico");
@@ -157,18 +158,14 @@ test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinf
   expect(cdxj[1].status).toBe("200");
   expect(cdxj[1].mime).toBe("application/xml");
 
-  expect(cdxj[2].url).toBe(XML);
-  expect(cdxj[2].status).toBe("200");
-  expect(cdxj[2].mime).toBe("application/xml");
+  expect(cdxj[2].url).toBe(XML_REDIR);
+  expect(cdxj[2].status).toBe("301");
 
-  expect(cdxj[3].url).toBe(XML_REDIR);
-  expect(cdxj[3].status).toBe("301");
+  expect(cdxj[3].url).toBe("urn:pageinfo:" + XML);
+  expect(cdxj[3].mime).toBe("application/json");
 
-  expect(cdxj[4].url).toBe("urn:pageinfo:" + XML);
+  expect(cdxj[4].url).toBe("urn:pageinfo:" + XML_REDIR);
   expect(cdxj[4].mime).toBe("application/json");
-
-  expect(cdxj[5].url).toBe("urn:pageinfo:" + XML_REDIR);
-  expect(cdxj[5].mime).toBe("application/json");
 });