qa: filter out non-html pages (#541)

Fixes #540 

Also ensure the MIME type is set on the page for non-HTML pages when they are
loaded through the browser; it was already being set on the direct-fetch path.
This commit is contained in:
Ilya Kreymer 2024-04-12 16:21:50 -07:00 committed by GitHub
parent 8d4e9ca2dc
commit 16671cb610
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 11 additions and 1 deletions

View file

@ -1753,6 +1753,10 @@ self.__bx_behaviors.selectMainBehavior();
const contentType = resp.headers()["content-type"];
isHTMLPage = this.isHTMLContentType(contentType);
if (!isHTMLPage) {
data.mime = contentType.split(";")[0];
}
} catch (e) {
if (!(e instanceof Error)) {
throw e;

View file

@ -41,6 +41,7 @@ type ReplayPage = {
url: string;
ts: number;
id: string;
mime?: string;
};
type ComparisonData = {
@ -234,7 +235,12 @@ export class ReplayCrawler extends Crawler {
}
}
async _addPageIfInScope({ url, ts, id }: ReplayPage, depth: number) {
async _addPageIfInScope({ url, ts, id, mime }: ReplayPage, depth: number) {
if (mime && mime !== "text/html") {
logger.info("Skipping non-HTML page", { url, mime }, "replay");
return;
}
if (this.includeRx.length) {
let inScope = false;
for (const s of this.includeRx) {