qa: filter out non-html pages (#541)

Fixes #540. Also ensures the MIME type is set on the page for non-HTML pages when they are loaded through the browser; it was already being set for the direct-fetch path.
2025-10-19 14:33:17 +00:00 · 2024-04-12 16:21:50 -07:00 · 2024-04-12 16:21:50 -07:00 · 16671cb610
commit 16671cb610
parent 8d4e9ca2dc
2 changed files with 11 additions and 1 deletions
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -1753,6 +1753,10 @@ self.__bx_behaviors.selectMainBehavior();
      const contentType = resp.headers()["content-type"];

      isHTMLPage = this.isHTMLContentType(contentType);
+
+      if (!isHTMLPage) {
+        data.mime = contentType.split(";")[0];
+      }
    } catch (e) {
      if (!(e instanceof Error)) {
        throw e;
--- a/src/replaycrawler.ts
+++ b/src/replaycrawler.ts
@@ -41,6 +41,7 @@ type ReplayPage = {
  url: string;
  ts: number;
  id: string;
+  mime?: string;
 };

 type ComparisonData = {
@@ -234,7 +235,12 @@ export class ReplayCrawler extends Crawler {
    }
  }

-  async _addPageIfInScope({ url, ts, id }: ReplayPage, depth: number) {
+  async _addPageIfInScope({ url, ts, id, mime }: ReplayPage, depth: number) {
+    if (mime && mime !== "text/html") {
+      logger.info("Skipping non-HTML page", { url, mime }, "replay");
+      return;
+    }
+
    if (this.includeRx.length) {
      let inScope = false;
      for (const s of this.includeRx) {