Async Fetch Refactor (#880)

- separate out reading the stream response while the browser is waiting (not really async) from actual async loading; this is not handled via fetchResponseBody()
- unify async fetch: first try browser networking for a regular GET request, falling back to a regular fetch()
- load headers and body separately in async fetch, allowing the request to be cancelled after the headers are received (see the sketch below)
- refactor direct fetch of non-HTML pages: load the headers, then handle loading the body and adding the page asynchronously, allowing the worker to continue loading browser-based pages (should allow more parallelization in the future)
- unify WARC writing in preparation for dedup: a unified serializeWARC() is called for all paths, the WARC digest is computed, and additional payload checks are added for streaming loading
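
A minimal TypeScript sketch of the headers-first flow described above: try browser networking for a plain GET first, fall back to a regular fetch(), and decide from the headers whether to keep streaming the body or cancel the request. This is not the crawler's actual API; the tryBrowserNetworking helper and the HeadersFirstResult shape are hypothetical, made up for illustration.

// a sketch, assuming a hypothetical tryBrowserNetworking() helper --
// not the crawler's real directFetchCapture()/fetchResponseBody() code paths
interface HeadersFirstResult {
  status: number;
  headers: Headers;
  // resolve the body stream later, or abort without reading it
  body: () => Promise<ReadableStream<Uint8Array> | null>;
  abort: () => void;
}

async function headersFirstFetch(
  url: string,
  tryBrowserNetworking: (url: string) => Promise<HeadersFirstResult | null>,
): Promise<HeadersFirstResult> {
  // prefer browser networking for a regular GET request
  const viaBrowser = await tryBrowserNetworking(url).catch(() => null);
  if (viaBrowser) {
    return viaBrowser;
  }
  // fallback: regular fetch(); headers resolve before the body is consumed
  const controller = new AbortController();
  const resp = await fetch(url, { signal: controller.signal });
  return {
    status: resp.status,
    headers: resp.headers,
    body: async () => resp.body,
    abort: () => controller.abort(),
  };
}

// usage: inspect the headers, then either stream the body or cancel early
async function directFetchExample(
  url: string,
  tryBrowserNetworking: (url: string) => Promise<HeadersFirstResult | null>,
) {
  const result = await headersFirstFetch(url, tryBrowserNetworking);
  if (result.headers.get("content-type")?.startsWith("text/html")) {
    // let the browser handle HTML pages; cancel after headers
    result.abort();
    return;
  }
  // continue loading the body asynchronously (e.g. stream it into a WARC record)
  await result.body();
}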
Ilya Kreymer 2025-09-10 12:05:21 -07:00 committed by GitHub
parent a42c0b926e
commit 705bc0cd9f
4 changed files with 524 additions and 551 deletions

@@ -1059,58 +1059,43 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;
+    let result = false;
     if (recorder) {
       try {
         const headers = auth
           ? { Authorization: auth, ...this.headers }
           : this.headers;
-        const result = await timedRun(
-          recorder.directFetchCapture({ url, headers, cdp }),
+        result = await timedRun(
+          recorder.directFetchCapture({
+            url,
+            headers,
+            cdp,
+            state: data,
+            crawler: this,
+          }),
           this.params.pageLoadTimeout,
           "Direct fetch of page URL timed out",
           logDetails,
           "fetch",
         );
-        // fetched timed out, already logged, don't retry in browser
-        if (!result) {
-          return;
-        }
-        const { fetched, mime, ts } = result;
-        if (mime) {
-          data.mime = mime;
-          data.isHTMLPage = isHTMLMime(mime);
-        }
-        if (fetched) {
-          data.loadState = LoadState.FULL_PAGE_LOADED;
-          data.status = 200;
-          data.ts = ts || new Date();
-          logger.info(
-            "Direct fetch successful",
-            { url, mime, ...logDetails },
-            "fetch",
-          );
-          return;
-        }
       } catch (e) {
-        if (e instanceof Error && e.message === "response-filtered-out") {
-          // filtered out direct fetch
-          logger.debug(
-            "Direct fetch response not accepted, continuing with browser fetch",
-            logDetails,
-            "fetch",
-          );
-        } else {
-          logger.error(
-            "Direct fetch of page URL failed",
-            { e, ...logDetails },
-            "fetch",
-          );
-          return;
-        }
+        logger.error(
+          "Direct fetch of page URL failed",
+          { e, ...logDetails },
+          "fetch",
+        );
       }
+      if (!result) {
+        logger.debug(
+          "Direct fetch response not accepted, continuing with browser fetch",
+          logDetails,
+          "fetch",
+        );
+      } else {
+        return;
+      }
     }
@@ -1280,6 +1265,10 @@ self.__bx_behaviors.selectMainBehavior();
   }
   async pageFinished(data: PageState, lastErrorText = "") {
+    // not yet finished
+    if (data.asyncLoading) {
+      return;
+    }
     // if page loaded, considered page finished successfully
     // (even if behaviors timed out)
     const { loadState, logDetails, depth, url, pageSkipped } = data;

File diff suppressed because it is too large

@@ -85,6 +85,7 @@ export class PageState {
   skipBehaviors = false;
   pageSkipped = false;
+  asyncLoading = false;
   filteredFrames: Frame[] = [];
   loadState: LoadState = LoadState.FAILED;
   contentCheckAllowed = false;

@@ -76,7 +76,7 @@ test("PDF: check that the pages.jsonl file entry contains status code and mime t
   expect(pageH.loadState).toBe(2);
 });
-test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinfo entries", () => {
+test("PDF: check that CDX contains data from two crawls: one pdf 200, one 301 and one 200, two pageinfo entries", () => {
   const filedata = fs.readFileSync(
     "test-crawls/collections/crawl-pdf/indexes/index.cdxj",
     { encoding: "utf-8" },
@@ -90,6 +90,7 @@ test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinf
   expect(cdxj[0].url).toBe(PDF_HTTP);
   expect(cdxj[0].status).toBe("301");
+  // this is duplicated as this is data from two crawls
   expect(cdxj[1].url).toBe(PDF);
   expect(cdxj[1].status).toBe("200");
   expect(cdxj[1].mime).toBe("application/pdf");
@@ -149,7 +150,7 @@ test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinf
   const lines = filedata.trim().split("\n");
   const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1);
-  expect(cdxj.length).toBe(6);
+  expect(cdxj.length).toBe(5);
   expect(cdxj[0].url).toBe("https://webrecorder.net/favicon.ico");
@@ -157,18 +158,14 @@ test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinf
   expect(cdxj[1].status).toBe("200");
   expect(cdxj[1].mime).toBe("application/xml");
-  expect(cdxj[2].url).toBe(XML);
-  expect(cdxj[2].status).toBe("200");
-  expect(cdxj[2].mime).toBe("application/xml");
+  expect(cdxj[2].url).toBe(XML_REDIR);
+  expect(cdxj[2].status).toBe("301");
-  expect(cdxj[3].url).toBe(XML_REDIR);
-  expect(cdxj[3].status).toBe("301");
+  expect(cdxj[3].url).toBe("urn:pageinfo:" + XML);
+  expect(cdxj[3].mime).toBe("application/json");
-  expect(cdxj[4].url).toBe("urn:pageinfo:" + XML);
+  expect(cdxj[4].url).toBe("urn:pageinfo:" + XML_REDIR);
   expect(cdxj[4].mime).toBe("application/json");
-  expect(cdxj[5].url).toBe("urn:pageinfo:" + XML_REDIR);
-  expect(cdxj[5].mime).toBe("application/json");
 });