mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 06:23:16 +00:00)
fix for direct fetch timeouts (#677)
- use '--timeout' value for direct fetch timeout, instead of fixed 30 seconds
- don't consider 'document' as essential resource regardless of mime type, as any top-level URL is a document
- don't count non-200 responses as non-essential even if missing content-type

fixes #676
parent 85a07aff18
commit 0d6a0b0efa
2 changed files with 4 additions and 2 deletions
@@ -878,7 +878,7 @@ self.__bx_behaviors.selectMainBehavior();
     try {
       const { fetched, mime, ts } = await timedRun(
         directFetchCapture({ url, headers: this.headers, cdp }),
-        FETCH_TIMEOUT_SECS,
+        this.params.pageLoadTimeout,
         "Direct fetch capture attempt timed out",
         logDetails,
         "fetch",
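
For context, a timed-run wrapper of this shape races a promise against a timeout given in seconds. The sketch below is a simplified assumption, not the actual timedRun helper from browsertrix-crawler; it only illustrates why passing this.params.pageLoadTimeout (the '--timeout' value) instead of the fixed FETCH_TIMEOUT_SECS constant lets the direct fetch run as long as the configured page load timeout.

// Simplified, hypothetical sketch of a timed-run helper (not the real
// timedRun from browsertrix-crawler): race a promise against a timeout
// given in seconds, logging a message if the timeout wins.
async function timedRunSketch<T>(
  promise: Promise<T>,
  timeoutSecs: number,
  message: string,
): Promise<T | undefined> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timedOut = new Promise<undefined>((resolve) => {
    timer = setTimeout(() => resolve(undefined), timeoutSecs * 1000);
  });
  try {
    const result = await Promise.race([promise, timedOut]);
    if (result === undefined) {
      console.warn(message, { timeoutSecs });
    }
    return result;
  } finally {
    if (timer !== undefined) {
      clearTimeout(timer);
    }
  }
}

// With this commit, the seconds value comes from the configurable
// pageLoadTimeout ('--timeout') rather than a fixed 30-second constant:
//   await timedRunSketch(
//     directFetchCapture({ url, headers, cdp }),
//     params.pageLoadTimeout,
//     "Direct fetch capture attempt timed out",
//   );
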
@@ -670,8 +670,10 @@ export class Recorder {

     // if contentLength is large or unknown, do streaming, unless its an essential resource
     // in which case, need to do a full fetch either way
+    // don't count non-200 responses which may not have content-length
     if (
       (contentLen < 0 || contentLen > MAX_BROWSER_DEFAULT_FETCH_SIZE) &&
+      responseStatusCode === 200 &&
       !this.isEssentialResource(reqresp.resourceType, mimeType)
     ) {
       const opts: ResponseStreamAsyncFetchOptions = {
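
Read on its own, the new condition amounts to a three-part predicate. The sketch below restates it with an illustrative constant value and a made-up helper name: only large or unknown-length responses that actually returned 200 and are not essential resources go down the streaming fetch path.

// Illustrative threshold; the real MAX_BROWSER_DEFAULT_FETCH_SIZE constant
// is defined elsewhere in browsertrix-crawler and may have a different value.
const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;

// Sketch of the condition after this change, as a standalone predicate
// (the function name is made up for illustration): stream only responses
// that are large or of unknown length, actually returned 200, and are
// not essential. Non-200 responses often have no content-length, so a
// contentLen of -1 alone no longer routes them to the streaming path.
function shouldStreamResponse(
  contentLen: number,
  responseStatusCode: number,
  isEssential: boolean,
): boolean {
  return (
    (contentLen < 0 || contentLen > MAX_BROWSER_DEFAULT_FETCH_SIZE) &&
    responseStatusCode === 200 &&
    !isEssential
  );
}

// Example: a 404 with unknown length is no longer streamed.
// shouldStreamResponse(-1, 404, false) === false
// shouldStreamResponse(-1, 200, false) === true
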
@@ -1030,7 +1032,7 @@ export class Recorder {
   }

   isEssentialResource(resourceType: string | undefined, contentType: string) {
-    if (["document", "stylesheet", "script"].includes(resourceType || "")) {
+    if (resourceType === "script" || resourceType === "stylesheet") {
       return true;
     }
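
The hunk cuts off before the rest of isEssentialResource, but the intent of the change can be sketched as follows; the trailing content-type check is an assumption standing in for the real method's remaining logic. The key point is that 'document' no longer returns true unconditionally, since any top-level URL is a document regardless of its mime type.

// Sketch of the revised logic, not the actual method body.
function isEssentialResourceSketch(
  resourceType: string | undefined,
  contentType: string,
): boolean {
  // scripts and stylesheets still always require a full fetch
  if (resourceType === "script" || resourceType === "stylesheet") {
    return true;
  }

  // "document" is deliberately no longer an automatic match here: every
  // top-level URL is a document, so the decision falls through to the
  // content type instead. The check below is a placeholder for whatever
  // the real method does with contentType after the lines shown above.
  return contentType === "application/json";
}
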