mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
skipping resources: ensure HEAD, OPTIONS, 204, 206, and 304 response/request pairs are not written to WARC
This commit is contained in:
parent
2fc0f67f04
commit
bc201be7f1
2 changed files with 21 additions and 2 deletions
|
@@ -912,8 +912,14 @@ export class Recorder {
|
||||||
}
|
}
|
||||||
|
|
||||||
async serializeToWARC(reqresp: RequestResponseInfo) {
|
async serializeToWARC(reqresp: RequestResponseInfo) {
|
||||||
if (!reqresp.payload) {
|
if (reqresp.shouldSkip()) {
|
||||||
logNetwork("Not writing, no payload", { url: reqresp.url });
|
const { url, method, status, payload } = reqresp;
|
||||||
|
logNetwork("Skipping request/response", {
|
||||||
|
url,
|
||||||
|
method,
|
||||||
|
status,
|
||||||
|
payloadLength: payload && payload.length,
|
||||||
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@@ -270,6 +270,19 @@ export class RequestResponseInfo {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
shouldSkip() {
|
||||||
|
// skip OPTIONS/HEAD responses, and 304, 204 or 206 responses
|
||||||
|
if (
|
||||||
|
!this.payload ||
|
||||||
|
(this.method && ["OPTIONS", "HEAD"].includes(this.method)) ||
|
||||||
|
[204, 206, 304].includes(this.status)
|
||||||
|
) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
getCanonURL(): string {
|
getCanonURL(): string {
|
||||||
if (!this.method || this.method === "GET") {
|
if (!this.method || this.method === "GET") {
|
||||||
return this.url;
|
return this.url;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue