Edge case: check for page responses that are non-200 or are missing a MIME type, possibly a captcha/SSO check;

remove them from the dupe check to allow recapture.
This commit is contained in:
Ilya Kreymer 2025-03-20 19:16:21 -07:00
parent f33d350111
commit 34e1579a42

View file

@ -507,7 +507,7 @@ export class Recorder extends EventEmitter {
return;
}
this.serializeToWARC(reqresp).catch((e) =>
this.serializeToWARC(reqresp, true).catch((e) =>
logger.warn("Error Serializing to WARC", e, "recorder"),
);
}
@ -1327,7 +1327,7 @@ export class Recorder extends EventEmitter {
return reqresp;
}
async serializeToWARC(reqresp: RequestResponseInfo) {
async serializeToWARC(reqresp: RequestResponseInfo, fromFinished = false) {
// always include in pageinfo record if going to serialize to WARC
// even if serialization does not happen
this.addPageRecord(reqresp);
@ -1371,6 +1371,15 @@ export class Recorder extends EventEmitter {
const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
this.writer.writeRecordPair(responseRecord, requestRecord);
// edge case: from finished response load, and page response and no mime type or status != 200, possibly a captcha/sso page
// allow it to be captured again
if (
(fromFinished && url === this.pageUrl && !reqresp.getMimeType()) ||
status !== 200
) {
await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status);
}
}
async directFetchCapture({