mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Include resource type + mime type in page resources list (#468)
The `:pageinfo:<url>` record now includes the MIME type and resource type (as reported by Chrome) along with the status code for each resource, for better filtering and comparison.
This commit is contained in:
parent
8d2d79a5df
commit
a512e92886
3 changed files with 139 additions and 37 deletions
|
@ -48,10 +48,17 @@ function logNetwork(msg: string, data: any) {
|
|||
// logger.debug(msg, data, "recorderNetwork");
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
export type PageInfoValue = {
|
||||
status: number;
|
||||
mime?: string;
|
||||
type?: string;
|
||||
};
|
||||
|
||||
// =================================================================
|
||||
export type PageInfoRecord = {
|
||||
pageid: string;
|
||||
urls: Record<string, number>;
|
||||
urls: Record<string, PageInfoValue>;
|
||||
url: string;
|
||||
ts?: Date;
|
||||
};
|
||||
|
@ -190,7 +197,7 @@ export class Recorder {
|
|||
});
|
||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
||||
if (reqresp) {
|
||||
this.addPageRecord(reqresp);
|
||||
reqresp.resourceType = params.this.addPageRecord(reqresp);
|
||||
|
||||
this.removeReqResp(params.requestId);
|
||||
}
|
||||
|
@ -250,7 +257,7 @@ export class Recorder {
|
|||
}
|
||||
|
||||
handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
|
||||
const { requestId, response } = params;
|
||||
const { requestId, response, type } = params;
|
||||
|
||||
const { mimeType } = response;
|
||||
|
||||
|
@ -263,7 +270,7 @@ export class Recorder {
|
|||
return;
|
||||
}
|
||||
|
||||
reqresp.fillResponse(response);
|
||||
reqresp.fillResponse(response, type);
|
||||
|
||||
this.addPageRecord(reqresp);
|
||||
}
|
||||
|
@ -280,7 +287,7 @@ export class Recorder {
|
|||
}
|
||||
|
||||
handleRedirectResponse(params: Protocol.Network.RequestWillBeSentEvent) {
|
||||
const { requestId, redirectResponse } = params;
|
||||
const { requestId, redirectResponse, type } = params;
|
||||
|
||||
// remove and serialize, but allow reusing requestId
|
||||
// as redirect chain may reuse same requestId for subsequent request
|
||||
|
@ -289,7 +296,7 @@ export class Recorder {
|
|||
return;
|
||||
}
|
||||
|
||||
reqresp.fillResponse(redirectResponse);
|
||||
reqresp.fillResponse(redirectResponse, type);
|
||||
|
||||
if (reqresp.isSelfRedirect()) {
|
||||
logger.warn(
|
||||
|
@ -312,6 +319,7 @@ export class Recorder {
|
|||
}
|
||||
|
||||
const { url } = reqresp;
|
||||
reqresp.resourceType = type;
|
||||
|
||||
switch (errorText) {
|
||||
case "net::ERR_BLOCKED_BY_CLIENT":
|
||||
|
@ -648,8 +656,10 @@ export class Recorder {
|
|||
}
|
||||
|
||||
addPageRecord(reqresp: RequestResponseInfo) {
|
||||
if (this.isValidUrl(this.pageInfo.url)) {
|
||||
this.pageInfo.urls[reqresp.getCanonURL()] = reqresp.status;
|
||||
if (this.isValidUrl(reqresp.url)) {
|
||||
const { status, resourceType: type } = reqresp;
|
||||
const mime = reqresp.getMimeType();
|
||||
this.pageInfo.urls[reqresp.getCanonURL()] = { status, mime, type };
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ const CONTENT_TYPE = "content-type";
|
|||
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
|
||||
|
||||
// max URL length for post/put payload-converted URLs
|
||||
const MAX_URL_LENGTH = 4096;
|
||||
export const MAX_URL_LENGTH = 4096;
|
||||
|
||||
// max length for single query arg for post/put converted URLs
|
||||
const MAX_ARG_LEN = 512;
|
||||
|
@ -24,6 +24,8 @@ export class RequestResponseInfo {
|
|||
url!: string;
|
||||
protocol?: string = "HTTP/1.1";
|
||||
|
||||
mimeType?: string;
|
||||
|
||||
// request data
|
||||
requestHeaders?: Record<string, string>;
|
||||
requestHeadersText?: string;
|
||||
|
@ -88,7 +90,7 @@ export class RequestResponseInfo {
|
|||
this.frameId = params.frameId;
|
||||
}
|
||||
|
||||
fillResponse(response: Protocol.Network.Response) {
|
||||
fillResponse(response: Protocol.Network.Response, type?: string) {
|
||||
// if initial fetch was a 200, but now replacing with 304, don't!
|
||||
if (
|
||||
response.status == 304 &&
|
||||
|
@ -106,6 +108,10 @@ export class RequestResponseInfo {
|
|||
|
||||
this.protocol = response.protocol;
|
||||
|
||||
if (type) {
|
||||
this.resourceType = type;
|
||||
}
|
||||
|
||||
if (response.requestHeaders) {
|
||||
this.requestHeaders = response.requestHeaders;
|
||||
}
|
||||
|
@ -246,6 +252,21 @@ export class RequestResponseInfo {
|
|||
return headersDict;
|
||||
}
|
||||
|
||||
getMimeType() {
|
||||
if (this.mimeType) {
|
||||
return this.mimeType;
|
||||
}
|
||||
|
||||
const headers = new Headers(this.getResponseHeadersDict());
|
||||
const contentType = headers.get(CONTENT_TYPE);
|
||||
|
||||
if (!contentType) {
|
||||
return;
|
||||
}
|
||||
|
||||
return contentType.split(";")[0];
|
||||
}
|
||||
|
||||
isValidBinary() {
|
||||
if (!this.payload) {
|
||||
return false;
|
||||
|
|
|
@ -52,22 +52,71 @@ function validateResourcesIndex(json) {
|
|||
expect(json).toHaveProperty("ts");
|
||||
expect(json).toHaveProperty("urls");
|
||||
expect(json.urls).toEqual({
|
||||
"https://webrecorder.net/": 200,
|
||||
"https://webrecorder.net/assets/main.css": 200,
|
||||
"https://webrecorder.net/assets/tools/awp-icon.png": 200,
|
||||
"https://webrecorder.net/assets/wr-logo.svg": 200,
|
||||
"https://webrecorder.net/assets/tools/browsertrixcrawler.png": 200,
|
||||
"https://webrecorder.net/assets/tools/logo-pywb.png": 200,
|
||||
"https://webrecorder.net/assets/images/btrix-cloud.png": 200,
|
||||
"https://webrecorder.net/assets/tools/rwp-icon.png": 200,
|
||||
"https://webrecorder.net/assets/fontawesome/all.css": 200,
|
||||
"https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap": 200,
|
||||
"https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap": 200,
|
||||
"https://stats.browsertrix.com/js/script.js": 200,
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": 200,
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": 200,
|
||||
"https://webrecorder.net/assets/favicon.ico": 200,
|
||||
"https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2F&d=webrecorder.net": 202,
|
||||
"https://webrecorder.net/": {
|
||||
status: 200,
|
||||
mime: "text/html",
|
||||
type: "Document",
|
||||
},
|
||||
"https://webrecorder.net/assets/fontawesome/all.css": {
|
||||
status: 200,
|
||||
mime: "text/css",
|
||||
type: "Stylesheet",
|
||||
},
|
||||
"https://webrecorder.net/assets/wr-logo.svg": {
|
||||
status: 200,
|
||||
mime: "image/svg+xml",
|
||||
type: "Image",
|
||||
},
|
||||
"https://webrecorder.net/assets/tools/awp-icon.png": {
|
||||
status: 200,
|
||||
mime: "image/png",
|
||||
type: "Image",
|
||||
},
|
||||
"https://webrecorder.net/assets/tools/logo-pywb.png": {
|
||||
status: 200,
|
||||
mime: "image/png",
|
||||
type: "Image",
|
||||
},
|
||||
"https://webrecorder.net/assets/tools/browsertrixcrawler.png": {
|
||||
status: 200,
|
||||
mime: "image/png",
|
||||
type: "Image",
|
||||
},
|
||||
"https://webrecorder.net/assets/tools/rwp-icon.png": {
|
||||
status: 200,
|
||||
mime: "image/png",
|
||||
type: "Image",
|
||||
},
|
||||
"https://webrecorder.net/assets/images/btrix-cloud.png": {
|
||||
status: 200,
|
||||
mime: "image/png",
|
||||
type: "Image",
|
||||
},
|
||||
"https://webrecorder.net/assets/main.css": {
|
||||
status: 200,
|
||||
mime: "text/css",
|
||||
type: "Stylesheet",
|
||||
},
|
||||
"https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap":
|
||||
{ status: 200, mime: "text/css", type: "Stylesheet" },
|
||||
"https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap":
|
||||
{ status: 200, mime: "text/css", type: "Stylesheet" },
|
||||
"https://stats.browsertrix.com/js/script.js": {
|
||||
status: 200,
|
||||
mime: "application/javascript",
|
||||
type: "Script",
|
||||
},
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
|
||||
{ status: 200, mime: "font/woff2", type: "Font" },
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
|
||||
{ status: 200, mime: "font/woff2", type: "Font" },
|
||||
"https://webrecorder.net/assets/favicon.ico": {
|
||||
status: 200,
|
||||
mime: "image/vnd.microsoft.icon",
|
||||
type: "Other",
|
||||
},
|
||||
"https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2F&d=webrecorder.net":
|
||||
{ status: 202, mime: "text/plain", type: "XHR" },
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -77,16 +126,38 @@ function validateResourcesAbout(json) {
|
|||
expect(json).toHaveProperty("ts");
|
||||
expect(json).toHaveProperty("urls");
|
||||
expect(json.urls).toEqual({
|
||||
"https://webrecorder.net/about": 200,
|
||||
"https://webrecorder.net/assets/main.css": 200,
|
||||
"https://webrecorder.net/assets/fontawesome/all.css": 200,
|
||||
"https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap": 200,
|
||||
"https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap": 200,
|
||||
"https://stats.browsertrix.com/js/script.js": 200,
|
||||
"https://webrecorder.net/assets/wr-logo.svg": 200,
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": 200,
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": 200,
|
||||
//"https://webrecorder.net/assets/favicon.ico": 200,
|
||||
//"https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net": 202,
|
||||
"https://webrecorder.net/about": {
|
||||
status: 200,
|
||||
mime: "text/html",
|
||||
type: "Document",
|
||||
},
|
||||
"https://webrecorder.net/assets/main.css": {
|
||||
status: 200,
|
||||
mime: "text/css",
|
||||
type: "Stylesheet",
|
||||
},
|
||||
"https://webrecorder.net/assets/fontawesome/all.css": {
|
||||
status: 200,
|
||||
mime: "text/css",
|
||||
type: "Stylesheet",
|
||||
},
|
||||
"https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap":
|
||||
{ status: 200, mime: "text/css", type: "Stylesheet" },
|
||||
"https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap":
|
||||
{ status: 200, mime: "text/css", type: "Stylesheet" },
|
||||
"https://stats.browsertrix.com/js/script.js": {
|
||||
status: 200,
|
||||
mime: "application/javascript",
|
||||
type: "Script",
|
||||
},
|
||||
"https://webrecorder.net/assets/wr-logo.svg": {
|
||||
status: 200,
|
||||
mime: "image/svg+xml",
|
||||
type: "Image",
|
||||
},
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
|
||||
{ status: 200, mime: "font/woff2", type: "Font" },
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
|
||||
{ status: 200, mime: "font/woff2", type: "Font" },
|
||||
});
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue