always return wacz; store wacz-level dedupe depends only for the current wacz

store crawlId-level dedupe depends for the entire crawl
This commit is contained in:
Ilya Kreymer 2025-10-24 15:01:00 -07:00
parent 9fba5da0ce
commit c4f07c4e59
2 changed files with 11 additions and 5 deletions

View file

@ -1899,12 +1899,14 @@ self.__bx_behaviors.selectMainBehavior();
if (this.deduping) {
await this.crawlState.setStatus("post-crawl");
await this.crawlState.updateDedupSource(wacz);
await this.crawlState.clearDupeFileRef();
}
await this.crawlState.clearWACZFilename();
}
if (wacz && this.uploadAndDeleteLocal) {
if (wacz && this.storage && this.uploadAndDeleteLocal) {
await this.crawlState.setArchiveSize(0);
logger.info(
@ -2033,9 +2035,8 @@ self.__bx_behaviors.selectMainBehavior();
const targetFilename = await this.crawlState.getWACZFilename();
await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);
return wacz;
}
return wacz;
} catch (e) {
logger.error("Error creating WACZ", e);
if (!streaming) {

View file

@ -1341,11 +1341,16 @@ return inx;
// DEPENDENT CRAWLS FOR DEDUPE
// Record that the current crawl depends on another crawl's WACZ for dedupe:
// stores the "crawlId index" pair plus the bare crawlId in Redis sets.
// NOTE(review): the two `dindex` sadd lines below appear to be the removed/added
// pair of this diff rendered together — this commit replaces the crawl-scoped
// `${this.crawlId}:dindex` key with the per-WACZ `${this.uid}:dindex` key
// (cleared by clearDupeFileRef); confirm only the uid-keyed line belongs.
async addDupeCrawlRef(crawlId: string, index: string) {
await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index);
await this.redis.sadd(`${this.uid}:dindex`, crawlId + " " + index);
// crawl-scoped set of all dependent crawl ids — kept for the entire crawl
await this.redis.sadd(`${this.crawlId}:depCrawls`, crawlId);
}
// Delete the per-WACZ dedupe index set (keyed by this.uid), so the next
// WACZ starts with an empty set of dependent-source references.
async clearDupeFileRef() {
  const dupeIndexKey = `${this.uid}:dindex`;
  await this.redis.del(dupeIndexKey);
}
async getDupeDependentSources() {
const dependIndexes = await this.redis.smembers(`${this.crawlId}:dindex`);
const dependIndexes = await this.redis.smembers(`${this.uid}:dindex`);
const crawlIds = [];
for (const value of dependIndexes) {
const [crawlId, index] = value.split(" ");