mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
keep skipping dupe URLs as before
This commit is contained in:
parent
2ecf290d38
commit
87c94876f6
2 changed files with 28 additions and 35 deletions
|
|
@ -21,13 +21,7 @@ import {
|
|||
import { WARCRecord, multiValueHeader } from "warcio";
|
||||
import { TempFileBuffer, WARCSerializer } from "warcio/node";
|
||||
import { WARCWriter } from "./warcwriter.js";
|
||||
import {
|
||||
LoadState,
|
||||
normalizeDedupStatus,
|
||||
PageState,
|
||||
RedisCrawlState,
|
||||
WorkerId,
|
||||
} from "./state.js";
|
||||
import { LoadState, PageState, RedisCrawlState, WorkerId } from "./state.js";
|
||||
import { CDPSession, Protocol } from "puppeteer-core";
|
||||
import { Crawler } from "../crawler.js";
|
||||
import { getProxyDispatcher } from "./proxy.js";
|
||||
|
|
@ -1512,11 +1506,7 @@ export class Recorder extends EventEmitter {
|
|||
if (
|
||||
method === "GET" &&
|
||||
url &&
|
||||
!(await this.crawlState.addIfNoDupe(
|
||||
ASYNC_FETCH_DUPE_KEY,
|
||||
url,
|
||||
normalizeDedupStatus(status),
|
||||
))
|
||||
!(await this.crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url, status))
|
||||
) {
|
||||
reqresp.asyncLoading = false;
|
||||
return true;
|
||||
|
|
@ -1621,15 +1611,19 @@ export class Recorder extends EventEmitter {
|
|||
return false;
|
||||
}
|
||||
|
||||
// if (
|
||||
// url &&
|
||||
// method === "GET" &&
|
||||
// !isRedirectStatus(status) &&
|
||||
// !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status))
|
||||
// ) {
|
||||
// logNetwork("Skipping exact URL dupe in this crawl", { url, status, ...this.logDetails });
|
||||
// return false;
|
||||
// }
|
||||
if (
|
||||
url &&
|
||||
method === "GET" &&
|
||||
!isRedirectStatus(status) &&
|
||||
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status))
|
||||
) {
|
||||
logNetwork("Skipping exact URL dupe in this crawl", {
|
||||
url,
|
||||
status,
|
||||
...this.logDetails,
|
||||
});
|
||||
return false;
|
||||
}
|
||||
|
||||
let responseRecord = createResponse(reqresp, pageid, iter);
|
||||
const requestRecord = createRequest(reqresp, responseRecord, pageid);
|
||||
|
|
@ -1644,12 +1638,8 @@ export class Recorder extends EventEmitter {
|
|||
!(await this.checkStreamingRecordPayload(reqresp, serializer, false))
|
||||
) {
|
||||
serializer.externalBuffer?.purge();
|
||||
await this.crawlState.removeDupe(
|
||||
ASYNC_FETCH_DUPE_KEY,
|
||||
url,
|
||||
normalizeDedupStatus(status),
|
||||
);
|
||||
//await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status);
|
||||
await this.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
|
||||
await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -1683,10 +1673,10 @@ export class Recorder extends EventEmitter {
|
|||
|
||||
const hash = responseRecord.warcPayloadDigest || "";
|
||||
|
||||
if (!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, hash))) {
|
||||
serializer.externalBuffer?.purge();
|
||||
return false;
|
||||
}
|
||||
// if (!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, hash))) {
|
||||
// serializer.externalBuffer?.purge();
|
||||
// return false;
|
||||
// }
|
||||
|
||||
const date = responseRecord.warcDate || "";
|
||||
|
||||
|
|
|
|||
|
|
@ -1044,12 +1044,15 @@ return inx;
|
|||
return await this.redis.zcard(this.qkey);
|
||||
}
|
||||
|
||||
async addIfNoDupe(key: string, url: string, other_id: string) {
|
||||
return (await this.redis.sadd(key, other_id + "|" + url)) === 1;
|
||||
async addIfNoDupe(key: string, url: string, status: number) {
|
||||
return (
|
||||
(await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
|
||||
1
|
||||
);
|
||||
}
|
||||
|
||||
async removeDupe(key: string, url: string, other_id: string) {
|
||||
return await this.redis.srem(key, other_id + "|" + url);
|
||||
async removeDupe(key: string, url: string, status: number) {
|
||||
return await this.redis.srem(key, normalizeDedupStatus(status) + "|" + url);
|
||||
}
|
||||
|
||||
async isInUserSet(value: string) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue