keep skipping dupe URLs as before

Ilya Kreymer 2025-09-17 20:02:01 -07:00
parent 2ecf290d38
commit 87c94876f6
2 changed files with 28 additions and 35 deletions

src/util/recorder.ts

@@ -21,13 +21,7 @@ import {
 import { WARCRecord, multiValueHeader } from "warcio";
 import { TempFileBuffer, WARCSerializer } from "warcio/node";
 import { WARCWriter } from "./warcwriter.js";
-import {
-  LoadState,
-  normalizeDedupStatus,
-  PageState,
-  RedisCrawlState,
-  WorkerId,
-} from "./state.js";
+import { LoadState, PageState, RedisCrawlState, WorkerId } from "./state.js";
 import { CDPSession, Protocol } from "puppeteer-core";
 import { Crawler } from "../crawler.js";
 import { getProxyDispatcher } from "./proxy.js";
@@ -1512,11 +1506,7 @@ export class Recorder extends EventEmitter {
     if (
       method === "GET" &&
       url &&
-      !(await this.crawlState.addIfNoDupe(
-        ASYNC_FETCH_DUPE_KEY,
-        url,
-        normalizeDedupStatus(status),
-      ))
+      !(await this.crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url, status))
     ) {
       reqresp.asyncLoading = false;
       return true;
@@ -1621,15 +1611,19 @@ export class Recorder extends EventEmitter {
       return false;
     }
-    // if (
-    //   url &&
-    //   method === "GET" &&
-    //   !isRedirectStatus(status) &&
-    //   !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status))
-    // ) {
-    //   logNetwork("Skipping exact URL dupe in this crawl", { url, status, ...this.logDetails });
-    //   return false;
-    // }
+    if (
+      url &&
+      method === "GET" &&
+      !isRedirectStatus(status) &&
+      !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status))
+    ) {
+      logNetwork("Skipping exact URL dupe in this crawl", {
+        url,
+        status,
+        ...this.logDetails,
+      });
+      return false;
+    }
 
     let responseRecord = createResponse(reqresp, pageid, iter);
     const requestRecord = createRequest(reqresp, responseRecord, pageid);
@@ -1644,12 +1638,8 @@ export class Recorder extends EventEmitter {
       !(await this.checkStreamingRecordPayload(reqresp, serializer, false))
     ) {
       serializer.externalBuffer?.purge();
-      await this.crawlState.removeDupe(
-        ASYNC_FETCH_DUPE_KEY,
-        url,
-        normalizeDedupStatus(status),
-      );
-      //await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status);
+      await this.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
+      await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status);
       return false;
     }
@@ -1683,10 +1673,10 @@ export class Recorder extends EventEmitter {
     const hash = responseRecord.warcPayloadDigest || "";
-    if (!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, hash))) {
-      serializer.externalBuffer?.purge();
-      return false;
-    }
+    // if (!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, hash))) {
+    //   serializer.externalBuffer?.purge();
+    //   return false;
+    // }
     const date = responseRecord.warcDate || "";

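A minimal, self-contained sketch of the recorder-side dedup flow above: reserve a "<normalized status>|<url>" entry before writing a record, skip exact URL dupes, and release the entry if the payload fails to stream so a retry can still record it. An in-memory Set stands in for RedisCrawlState (the real methods use Redis SADD/SREM, per the state.ts diff below); normalizeDedupStatusStub and the key value are hypothetical placeholders:

const WRITE_DUPE_KEY = "writedupe"; // illustrative value; the real constant lives in the crawler

// Hypothetical placeholder: the real normalizeDedupStatus may bucket statuses differently.
function normalizeDedupStatusStub(status: number): string {
  return String(status);
}

class InMemoryCrawlState {
  private sets = new Map<string, Set<string>>();

  private setFor(key: string): Set<string> {
    let s = this.sets.get(key);
    if (!s) {
      s = new Set();
      this.sets.set(key, s);
    }
    return s;
  }

  // Like SADD: returns true only if the member was not already present.
  async addIfNoDupe(key: string, url: string, status: number): Promise<boolean> {
    const member = normalizeDedupStatusStub(status) + "|" + url;
    const s = this.setFor(key);
    if (s.has(member)) {
      return false;
    }
    s.add(member);
    return true;
  }

  // Like SREM: drops the member so the URL can be recorded on a later attempt.
  async removeDupe(key: string, url: string, status: number): Promise<void> {
    this.setFor(key).delete(normalizeDedupStatusStub(status) + "|" + url);
  }
}

async function demo() {
  const state = new InMemoryCrawlState();
  const url = "https://example.com/a";

  console.log(await state.addIfNoDupe(WRITE_DUPE_KEY, url, 200)); // true: first GET, record it
  console.log(await state.addIfNoDupe(WRITE_DUPE_KEY, url, 200)); // false: exact URL dupe, skip
  await state.removeDupe(WRITE_DUPE_KEY, url, 200); // payload failed to stream, release the entry
  console.log(await state.addIfNoDupe(WRITE_DUPE_KEY, url, 200)); // true: a retry can record it
}

void demo();
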
src/util/state.ts

@@ -1044,12 +1044,15 @@ return inx;
     return await this.redis.zcard(this.qkey);
   }
 
-  async addIfNoDupe(key: string, url: string, other_id: string) {
-    return (await this.redis.sadd(key, other_id + "|" + url)) === 1;
+  async addIfNoDupe(key: string, url: string, status: number) {
+    return (
+      (await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
+      1
+    );
   }
 
-  async removeDupe(key: string, url: string, other_id: string) {
-    return await this.redis.srem(key, other_id + "|" + url);
+  async removeDupe(key: string, url: string, status: number) {
+    return await this.redis.srem(key, normalizeDedupStatus(status) + "|" + url);
   }
 
   async isInUserSet(value: string) {
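
For reference, the updated methods map directly onto Redis set semantics: SADD returns the number of members actually added, so a return of 1 means a first sighting and 0 means a dupe, while SREM releases an entry. A sketch of equivalent standalone functions, assuming an ioredis client; the normalizeDedupStatus body and the "writedupe" key here are hypothetical stand-ins:

import Redis from "ioredis";

// Hypothetical stand-in for the real normalizeDedupStatus in state.ts;
// the actual status bucketing may differ.
function normalizeDedupStatus(status: number): string {
  return String(status);
}

const redis = new Redis("redis://localhost:6379");

// Mirrors the new state.ts signatures: members are "<normalized status>|<url>"
// in a Redis set per dedup key. SADD returns 1 only when the member is new.
async function addIfNoDupe(key: string, url: string, status: number) {
  return (await redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) === 1;
}

async function removeDupe(key: string, url: string, status: number) {
  return await redis.srem(key, normalizeDedupStatus(status) + "|" + url);
}

async function main() {
  // "writedupe" is an illustrative key; the real constant is WRITE_DUPE_KEY.
  console.log(await addIfNoDupe("writedupe", "https://example.com/", 200)); // true: first sighting
  console.log(await addIfNoDupe("writedupe", "https://example.com/", 200)); // false: exact dupe
  await removeDupe("writedupe", "https://example.com/", 200);
  console.log(await addIfNoDupe("writedupe", "https://example.com/", 200)); // true again after removal
  await redis.quit();
}

main();

Centralizing normalizeDedupStatus inside these methods lets call sites pass the raw numeric status, which also appears to be why the hash-based addIfNoDupe(WRITE_DUPE_KEY, url, hash) call in the recorder is commented out in this commit: a string digest no longer fits the status: number parameter.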