mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
edge case: check for page responses that are non-200 or missing a MIME type (possibly a captcha/SSO check);
remove them from the dupe check to allow recapture
This commit is contained in:
parent
f33d350111
commit
34e1579a42
1 changed files with 11 additions and 2 deletions
|
@ -507,7 +507,7 @@ export class Recorder extends EventEmitter {
|
|||
return;
|
||||
}
|
||||
|
||||
this.serializeToWARC(reqresp).catch((e) =>
|
||||
this.serializeToWARC(reqresp, true).catch((e) =>
|
||||
logger.warn("Error Serializing to WARC", e, "recorder"),
|
||||
);
|
||||
}
|
||||
|
@ -1327,7 +1327,7 @@ export class Recorder extends EventEmitter {
|
|||
return reqresp;
|
||||
}
|
||||
|
||||
async serializeToWARC(reqresp: RequestResponseInfo) {
|
||||
async serializeToWARC(reqresp: RequestResponseInfo, fromFinished = false) {
|
||||
// always include in pageinfo record if going to serialize to WARC
|
||||
// even if serialization does not happen
|
||||
this.addPageRecord(reqresp);
|
||||
|
@ -1371,6 +1371,15 @@ export class Recorder extends EventEmitter {
|
|||
const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
|
||||
|
||||
this.writer.writeRecordPair(responseRecord, requestRecord);
|
||||
|
||||
// edge case: from finished response load, and page response and no mime type or status != 200, possibly a captcha/sso page
|
||||
// allow it to be captured again
|
||||
if (
|
||||
(fromFinished && url === this.pageUrl && !reqresp.getMimeType()) ||
|
||||
status !== 200
|
||||
) {
|
||||
await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status);
|
||||
}
|
||||
}
|
||||
|
||||
async directFetchCapture({
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue