mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
skipping resources: ensure HEAD, OPTIONS, 206, and 304 response/request pairs are not written to WARC (#460)
Allows skipping network traffic that does not need to be stored, either because it is unnecessary or because storing it would result in incorrect replay (e.g., a 304 instead of a 200).
This commit is contained in:
parent
2fc0f67f04
commit
18ffb3d971
2 changed files with 21 additions and 2 deletions
|
@ -912,8 +912,14 @@ export class Recorder {
|
|||
}
|
||||
|
||||
async serializeToWARC(reqresp: RequestResponseInfo) {
|
||||
if (!reqresp.payload) {
|
||||
logNetwork("Not writing, no payload", { url: reqresp.url });
|
||||
if (reqresp.shouldSkipSave()) {
|
||||
const { url, method, status, payload } = reqresp;
|
||||
logNetwork("Skipping request/response", {
|
||||
url,
|
||||
method,
|
||||
status,
|
||||
payloadLength: payload && payload.length,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -270,6 +270,19 @@ export class RequestResponseInfo {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
 * Reports whether this request/response pair should be skipped rather than saved.
 *
 * @returns `true` when there is no payload, when the method is OPTIONS or
 * HEAD, or when the status is 206 or 304; otherwise `false`.
 */
shouldSkipSave() {
  // No payload: nothing to save.
  if (!this.payload) {
    return true;
  }

  // OPTIONS and HEAD exchanges are always skipped; an unset method falls through.
  switch (this.method) {
    case "OPTIONS":
    case "HEAD":
      return true;
  }

  // 206 and 304 responses are skipped as well.
  return this.status === 206 || this.status === 304;
}
|
||||
|
||||
getCanonURL(): string {
|
||||
if (!this.method || this.method === "GET") {
|
||||
return this.url;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue