mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
qa: filter out non-html pages (#541)
Fixes #540. Also ensures the MIME type is set on the page for non-HTML pages when they are loaded through the browser; it was already being set for the direct fetch path.
This commit is contained in:
parent
8d4e9ca2dc
commit
16671cb610
2 changed files with 11 additions and 1 deletion
|
@ -1753,6 +1753,10 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const contentType = resp.headers()["content-type"];
|
||||
|
||||
isHTMLPage = this.isHTMLContentType(contentType);
|
||||
|
||||
if (!isHTMLPage) {
|
||||
data.mime = contentType.split(";")[0];
|
||||
}
|
||||
} catch (e) {
|
||||
if (!(e instanceof Error)) {
|
||||
throw e;
|
||||
|
|
|
@ -41,6 +41,7 @@ type ReplayPage = {
|
|||
url: string;
|
||||
ts: number;
|
||||
id: string;
|
||||
mime?: string;
|
||||
};
|
||||
|
||||
type ComparisonData = {
|
||||
|
@ -234,7 +235,12 @@ export class ReplayCrawler extends Crawler {
|
|||
}
|
||||
}
|
||||
|
||||
async _addPageIfInScope({ url, ts, id }: ReplayPage, depth: number) {
|
||||
async _addPageIfInScope({ url, ts, id, mime }: ReplayPage, depth: number) {
|
||||
if (mime && mime !== "text/html") {
|
||||
logger.info("Skipping non-HTML page", { url, mime }, "replay");
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.includeRx.length) {
|
||||
let inScope = false;
|
||||
for (const s of this.includeRx) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue