mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Load non-HTML resources directly whenever possible (#583)
Optimize the direct loading of non-HTML pages. Previously, the behavior was: make a HEAD request first; make a direct fetch request only if the HEAD response is non-HTML with status 200; and use the fetch result only if it is non-HTML, has status 200, and does not set any cookies. This changes the behavior to: get cookies from the browser for the page URL; make a direct fetch request with those cookies, if present; and use the fetch result only if it is non-HTML with status 200. Also: ensures pageinfo is properly set with a timestamp for direct fetches, and removes obsolete Agent handling that is no longer used by the default (fetch) path. If the fetch request results in HTML, the response is aborted and browser loading is used instead.
This commit is contained in:
parent
089d901b9b
commit
a7d279cfbd
5 changed files with 114 additions and 94 deletions
|
@ -44,27 +44,18 @@ import { Browser } from "./util/browser.js";
|
|||
import {
|
||||
ADD_LINK_FUNC,
|
||||
BEHAVIOR_LOG_FUNC,
|
||||
HTML_TYPES,
|
||||
DEFAULT_SELECTORS,
|
||||
} from "./util/constants.js";
|
||||
|
||||
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
|
||||
import { OriginOverride } from "./util/originoverride.js";
|
||||
|
||||
// to ignore HTTPS error for HEAD check
|
||||
import { Agent as HTTPAgent } from "http";
|
||||
import { Agent as HTTPSAgent } from "https";
|
||||
import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core";
|
||||
import { Recorder } from "./util/recorder.js";
|
||||
import { SitemapReader } from "./util/sitemapper.js";
|
||||
import { ScopedSeed } from "./util/seeds.js";
|
||||
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
|
||||
|
||||
const HTTPS_AGENT = new HTTPSAgent({
|
||||
rejectUnauthorized: false,
|
||||
});
|
||||
|
||||
const HTTP_AGENT = new HTTPAgent();
|
||||
import { isHTMLContentType } from "./util/reqresp.js";
|
||||
|
||||
const behaviors = fs.readFileSync(
|
||||
new URL(
|
||||
|
@ -781,7 +772,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
async crawlPage(opts: WorkerState): Promise<void> {
|
||||
await this.writeStats();
|
||||
|
||||
const { page, data, workerid, callbacks, directFetchCapture } = opts;
|
||||
const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
|
||||
data.callbacks = callbacks;
|
||||
|
||||
const { url } = data;
|
||||
|
@ -790,35 +781,27 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
data.logDetails = logDetails;
|
||||
data.workerid = workerid;
|
||||
|
||||
data.isHTMLPage = await timedRun(
|
||||
this.isHTML(url, logDetails),
|
||||
FETCH_TIMEOUT_SECS,
|
||||
"HEAD request to determine if URL is HTML page timed out",
|
||||
logDetails,
|
||||
"fetch",
|
||||
true,
|
||||
);
|
||||
|
||||
if (!data.isHTMLPage && directFetchCapture) {
|
||||
if (directFetchCapture) {
|
||||
try {
|
||||
const { fetched, mime } = await timedRun(
|
||||
directFetchCapture(url),
|
||||
const { fetched, mime, ts } = await timedRun(
|
||||
directFetchCapture({ url, headers: this.headers, cdp }),
|
||||
FETCH_TIMEOUT_SECS,
|
||||
"Direct fetch capture attempt timed out",
|
||||
logDetails,
|
||||
"fetch",
|
||||
true,
|
||||
);
|
||||
if (mime) {
|
||||
data.mime = mime;
|
||||
data.isHTMLPage = isHTMLContentType(mime);
|
||||
}
|
||||
if (fetched) {
|
||||
data.loadState = LoadState.FULL_PAGE_LOADED;
|
||||
if (mime) {
|
||||
data.mime = mime;
|
||||
}
|
||||
data.status = 200;
|
||||
data.ts = new Date();
|
||||
data.ts = ts || new Date();
|
||||
logger.info(
|
||||
"Direct fetch successful",
|
||||
{ url, ...logDetails },
|
||||
{ url, mime, ...logDetails },
|
||||
"fetch",
|
||||
);
|
||||
return;
|
||||
|
@ -1752,7 +1735,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
const contentType = resp.headers()["content-type"];
|
||||
|
||||
isHTMLPage = this.isHTMLContentType(contentType);
|
||||
isHTMLPage = isHTMLContentType(contentType);
|
||||
|
||||
if (contentType) {
|
||||
data.mime = contentType.split(";")[0];
|
||||
|
@ -1878,7 +1861,9 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
"behavior",
|
||||
);
|
||||
try {
|
||||
await frame.evaluate("self.__bx_behaviors.awaitPageLoad();");
|
||||
await frame.evaluate(
|
||||
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
|
||||
);
|
||||
} catch (e) {
|
||||
logger.warn("Waiting for custom page load failed", e, "behavior");
|
||||
}
|
||||
|
@ -2191,49 +2176,6 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
}
|
||||
|
||||
resolveAgent(urlParsed: URL) {
|
||||
return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
|
||||
}
|
||||
|
||||
async isHTML(url: string, logDetails: LogDetails) {
|
||||
try {
|
||||
const resp = await fetch(url, {
|
||||
method: "HEAD",
|
||||
headers: this.headers,
|
||||
agent: this.resolveAgent,
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} as any);
|
||||
if (resp.status !== 200) {
|
||||
logger.debug("HEAD response code != 200, loading in browser", {
|
||||
status: resp.status,
|
||||
...logDetails,
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
return this.isHTMLContentType(resp.headers.get("Content-Type"));
|
||||
} catch (e) {
|
||||
// can't confirm not html, so try in browser
|
||||
logger.debug("HEAD request failed", { ...formatErr(e), ...logDetails });
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
isHTMLContentType(contentType: string | null) {
|
||||
// just load if no content-type
|
||||
if (!contentType) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const mime = contentType.split(";")[0];
|
||||
|
||||
if (HTML_TYPES.includes(mime)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
|
||||
if (!sitemap) {
|
||||
return;
|
||||
|
|
|
@ -6,7 +6,7 @@ import PQueue from "p-queue";
|
|||
|
||||
import { logger, formatErr } from "./logger.js";
|
||||
import { sleep, timedRun, timestampNow } from "./timing.js";
|
||||
import { RequestResponseInfo } from "./reqresp.js";
|
||||
import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js";
|
||||
|
||||
// @ts-expect-error TODO fill in why error is expected
|
||||
import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js";
|
||||
|
@ -75,11 +75,23 @@ export type AsyncFetchOptions = {
|
|||
filter?: (resp: Response) => boolean;
|
||||
ignoreDupe?: boolean;
|
||||
maxFetchSize?: number;
|
||||
manualRedirect?: boolean;
|
||||
};
|
||||
|
||||
// =================================================================
|
||||
export type ResponseStreamAsyncFetchOptions = AsyncFetchOptions & {
|
||||
export type DirectFetchRequest = {
|
||||
url: string;
|
||||
headers: Record<string, string>;
|
||||
cdp: CDPSession;
|
||||
};
|
||||
|
||||
// =================================================================
|
||||
export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & {
|
||||
cdp: CDPSession;
|
||||
};
|
||||
|
||||
// =================================================================
|
||||
export type ResponseStreamAsyncFetchOptions = NetworkLoadAsyncFetchOptions & {
|
||||
requestId: string;
|
||||
};
|
||||
|
||||
|
@ -1062,12 +1074,23 @@ export class Recorder {
|
|||
this.writer.writeRecordPair(responseRecord, requestRecord);
|
||||
}
|
||||
|
||||
async directFetchCapture(
|
||||
url: string,
|
||||
): Promise<{ fetched: boolean; mime: string }> {
|
||||
async directFetchCapture({ url, headers, cdp }: DirectFetchRequest): Promise<{
|
||||
fetched: boolean;
|
||||
mime: string;
|
||||
ts: Date;
|
||||
}> {
|
||||
const reqresp = new RequestResponseInfo("0");
|
||||
const ts = new Date();
|
||||
|
||||
const cookie = await this.getCookieString(cdp, url);
|
||||
if (cookie) {
|
||||
headers["Cookie"] = cookie;
|
||||
}
|
||||
|
||||
reqresp.url = url;
|
||||
reqresp.method = "GET";
|
||||
reqresp.requestHeaders = headers;
|
||||
reqresp.ts = ts;
|
||||
|
||||
logger.debug(
|
||||
"Directly fetching page URL without browser",
|
||||
|
@ -1075,8 +1098,21 @@ export class Recorder {
|
|||
"recorder",
|
||||
);
|
||||
|
||||
const filter = (resp: Response) =>
|
||||
resp.status === 200 && !resp.headers.get("set-cookie");
|
||||
let mime: string = "";
|
||||
|
||||
const filter = (resp: Response) => {
|
||||
// only direct load 200 responses
|
||||
if (resp.status !== 200) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const ct = resp.headers.get("content-type");
|
||||
if (ct) {
|
||||
mime = ct.split(";")[0];
|
||||
}
|
||||
|
||||
return !isHTMLContentType(mime);
|
||||
};
|
||||
|
||||
// ignore dupes: if previous URL was not a page, still load as page. if previous was page,
|
||||
// should not get here, as dupe pages tracked via seen list
|
||||
|
@ -1087,16 +1123,28 @@ export class Recorder {
|
|||
networkId: "0",
|
||||
filter,
|
||||
ignoreDupe: true,
|
||||
manualRedirect: true,
|
||||
});
|
||||
const res = await fetcher.load();
|
||||
|
||||
const mime =
|
||||
(reqresp.responseHeaders &&
|
||||
reqresp.responseHeaders["content-type"] &&
|
||||
reqresp.responseHeaders["content-type"].split(";")[0]) ||
|
||||
"";
|
||||
this.addPageRecord(reqresp);
|
||||
|
||||
return { fetched: res === "fetched", mime };
|
||||
if (url === this.pageUrl && !this.pageInfo.ts) {
|
||||
logger.debug("Setting page timestamp", { ts, url });
|
||||
this.pageInfo.ts = ts;
|
||||
}
|
||||
|
||||
return { fetched: res === "fetched", mime, ts };
|
||||
}
|
||||
|
||||
async getCookieString(cdp: CDPSession, url: string) {
|
||||
const cookieList: string[] = [];
|
||||
const { cookies } = await cdp.send("Network.getCookies", { urls: [url] });
|
||||
for (const { name, value } of cookies) {
|
||||
cookieList.push(`${name}=${value}`);
|
||||
}
|
||||
|
||||
return cookieList.join(";");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1115,6 +1163,8 @@ class AsyncFetcher {
|
|||
tempdir: string;
|
||||
filename: string;
|
||||
|
||||
manualRedirect = false;
|
||||
|
||||
constructor({
|
||||
tempdir,
|
||||
reqresp,
|
||||
|
@ -1124,6 +1174,7 @@ class AsyncFetcher {
|
|||
filter = undefined,
|
||||
ignoreDupe = false,
|
||||
maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE,
|
||||
manualRedirect = false,
|
||||
}: AsyncFetchOptions) {
|
||||
this.reqresp = reqresp;
|
||||
this.reqresp.expectedSize = expectedSize;
|
||||
|
@ -1142,6 +1193,8 @@ class AsyncFetcher {
|
|||
);
|
||||
|
||||
this.maxFetchSize = maxFetchSize;
|
||||
|
||||
this.manualRedirect = manualRedirect;
|
||||
}
|
||||
|
||||
async load() {
|
||||
|
@ -1277,9 +1330,9 @@ class AsyncFetcher {
|
|||
reqresp.status = 0;
|
||||
reqresp.errorText = e.message;
|
||||
} finally {
|
||||
recorder.addPageRecord(reqresp);
|
||||
// exclude direct fetch request with fake id
|
||||
if (networkId !== "0") {
|
||||
recorder.addPageRecord(reqresp);
|
||||
recorder.removeReqResp(networkId);
|
||||
}
|
||||
}
|
||||
|
@ -1307,6 +1360,7 @@ class AsyncFetcher {
|
|||
headers,
|
||||
body: reqresp.postData || undefined,
|
||||
signal,
|
||||
redirect: this.manualRedirect ? "manual" : "follow",
|
||||
});
|
||||
|
||||
if (this.filter && !this.filter(resp) && abort) {
|
||||
|
@ -1323,6 +1377,7 @@ class AsyncFetcher {
|
|||
}
|
||||
|
||||
if (reqresp.expectedSize === 0) {
|
||||
reqresp.fillFetchResponse(resp);
|
||||
reqresp.payload = new Uint8Array();
|
||||
return;
|
||||
} else if (!resp.body) {
|
||||
|
@ -1422,7 +1477,7 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher {
|
|||
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
|
||||
cdp: CDPSession;
|
||||
|
||||
constructor(opts: ResponseStreamAsyncFetchOptions) {
|
||||
constructor(opts: NetworkLoadAsyncFetchOptions) {
|
||||
super(opts);
|
||||
this.cdp = opts.cdp;
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js";
|
|||
|
||||
import { Protocol } from "puppeteer-core";
|
||||
import { postToGetUrl } from "warcio";
|
||||
import { HTML_TYPES } from "./constants.js";
|
||||
|
||||
const CONTENT_LENGTH = "content-length";
|
||||
const CONTENT_TYPE = "content-type";
|
||||
|
@ -148,10 +149,15 @@ export class RequestResponseInfo {
|
|||
}
|
||||
}
|
||||
|
||||
isRedirectStatus() {
|
||||
return this.status >= 300 && this.status < 400 && this.status !== 304;
|
||||
}
|
||||
|
||||
isSelfRedirect() {
|
||||
if (this.status < 300 || this.status >= 400 || this.status === 304) {
|
||||
if (!this.isRedirectStatus()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
const headers = new Headers(this.getResponseHeadersDict());
|
||||
const location = headers.get("location") || "";
|
||||
|
@ -362,3 +368,18 @@ export class RequestResponseInfo {
|
|||
return value.replace(/\n/g, ", ");
|
||||
}
|
||||
}
|
||||
|
||||
export function isHTMLContentType(contentType: string | null) {
|
||||
// just load if no content-type
|
||||
if (!contentType) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const mime = contentType.split(";")[0];
|
||||
|
||||
if (HTML_TYPES.includes(mime)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -66,7 +66,7 @@ export class PageState {
|
|||
|
||||
callbacks: PageCallbacks = {};
|
||||
|
||||
isHTMLPage?: boolean;
|
||||
isHTMLPage = true;
|
||||
text?: string;
|
||||
screenshotView?: Buffer;
|
||||
favicon?: string;
|
||||
|
|
|
@ -2,7 +2,7 @@ import os from "os";
|
|||
|
||||
import { logger, formatErr } from "./logger.js";
|
||||
import { sleep, timedRun } from "./timing.js";
|
||||
import { Recorder } from "./recorder.js";
|
||||
import { DirectFetchRequest, Recorder } from "./recorder.js";
|
||||
import { rxEscape } from "./seeds.js";
|
||||
import { CDPSession, Page } from "puppeteer-core";
|
||||
import { PageState, WorkerId } from "./state.js";
|
||||
|
@ -20,8 +20,10 @@ export type WorkerOpts = {
|
|||
workerid: WorkerId;
|
||||
// eslint-disable-next-line @typescript-eslint/ban-types
|
||||
callbacks: Record<string, Function>;
|
||||
directFetchCapture?:
|
||||
| ((url: string) => Promise<{ fetched: boolean; mime: string }>)
|
||||
directFetchCapture:
|
||||
| ((
|
||||
request: DirectFetchRequest,
|
||||
) => Promise<{ fetched: boolean; mime: string; ts: Date }>)
|
||||
| null;
|
||||
frameIdToExecId: Map<string, number>;
|
||||
};
|
||||
|
@ -171,7 +173,7 @@ export class PageWorker {
|
|||
this.cdp = cdp;
|
||||
this.callbacks = {};
|
||||
const directFetchCapture = this.recorder
|
||||
? (x: string) => this.recorder!.directFetchCapture(x)
|
||||
? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req)
|
||||
: null;
|
||||
this.opts = {
|
||||
page,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue