mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
qa: filter out non-html pages (#541)
Fixes #540. Also ensures the MIME type is set on the page for non-HTML pages when they are loaded through the browser; it was already being set for the direct-fetch path.
This commit is contained in:
parent
8d4e9ca2dc
commit
16671cb610
2 changed files with 11 additions and 1 deletions
|
@ -1753,6 +1753,10 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
const contentType = resp.headers()["content-type"];
|
const contentType = resp.headers()["content-type"];
|
||||||
|
|
||||||
isHTMLPage = this.isHTMLContentType(contentType);
|
isHTMLPage = this.isHTMLContentType(contentType);
|
||||||
|
|
||||||
|
if (!isHTMLPage) {
|
||||||
|
data.mime = contentType.split(";")[0];
|
||||||
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
if (!(e instanceof Error)) {
|
if (!(e instanceof Error)) {
|
||||||
throw e;
|
throw e;
|
||||||
|
|
|
@ -41,6 +41,7 @@ type ReplayPage = {
|
||||||
url: string;
|
url: string;
|
||||||
ts: number;
|
ts: number;
|
||||||
id: string;
|
id: string;
|
||||||
|
mime?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
type ComparisonData = {
|
type ComparisonData = {
|
||||||
|
@ -234,7 +235,12 @@ export class ReplayCrawler extends Crawler {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async _addPageIfInScope({ url, ts, id }: ReplayPage, depth: number) {
|
async _addPageIfInScope({ url, ts, id, mime }: ReplayPage, depth: number) {
|
||||||
|
if (mime && mime !== "text/html") {
|
||||||
|
logger.info("Skipping non-HTML page", { url, mime }, "replay");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (this.includeRx.length) {
|
if (this.includeRx.length) {
|
||||||
let inScope = false;
|
let inScope = false;
|
||||||
for (const s of this.includeRx) {
|
for (const s of this.includeRx) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue