Always return the WACZ; store per-WACZ dedupe dependencies only for the current WACZ

Store crawl ID dependencies for the entire crawl.
This commit is contained in:
Ilya Kreymer 2025-10-24 15:01:00 -07:00
parent 9fba5da0ce
commit c4f07c4e59
2 changed files with 11 additions and 5 deletions

View file

@ -1899,12 +1899,14 @@ self.__bx_behaviors.selectMainBehavior();
if (this.deduping) { if (this.deduping) {
await this.crawlState.setStatus("post-crawl"); await this.crawlState.setStatus("post-crawl");
await this.crawlState.updateDedupSource(wacz); await this.crawlState.updateDedupSource(wacz);
await this.crawlState.clearDupeFileRef();
} }
await this.crawlState.clearWACZFilename(); await this.crawlState.clearWACZFilename();
} }
if (wacz && this.uploadAndDeleteLocal) { if (wacz && this.storage && this.uploadAndDeleteLocal) {
await this.crawlState.setArchiveSize(0); await this.crawlState.setArchiveSize(0);
logger.info( logger.info(
@ -2033,9 +2035,8 @@ self.__bx_behaviors.selectMainBehavior();
const targetFilename = await this.crawlState.getWACZFilename(); const targetFilename = await this.crawlState.getWACZFilename();
await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished); await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);
return wacz;
} }
return wacz;
} catch (e) { } catch (e) {
logger.error("Error creating WACZ", e); logger.error("Error creating WACZ", e);
if (!streaming) { if (!streaming) {

View file

@ -1341,11 +1341,16 @@ return inx;
// DEPENDENT CRAWLS FOR DEDUPE // DEPENDENT CRAWLS FOR DEDUPE
async addDupeCrawlRef(crawlId: string, index: string) { async addDupeCrawlRef(crawlId: string, index: string) {
await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index); await this.redis.sadd(`${this.uid}:dindex`, crawlId + " " + index);
await this.redis.sadd(`${this.crawlId}:depCrawls`, crawlId);
}
async clearDupeFileRef() {
await this.redis.del(`${this.uid}:dindex`);
} }
async getDupeDependentSources() { async getDupeDependentSources() {
const dependIndexes = await this.redis.smembers(`${this.crawlId}:dindex`); const dependIndexes = await this.redis.smembers(`${this.uid}:dindex`);
const crawlIds = []; const crawlIds = [];
for (const value of dependIndexes) { for (const value of dependIndexes) {
const [crawlId, index] = value.split(" "); const [crawlId, index] = value.split(" ");