From 105bd06f30ee95e15103a9ceadc341d1dcc966e0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 24 Oct 2025 13:24:53 -0700 Subject: [PATCH] cleanup, keep compatibility with redis 6 still set to 'post-crawl' state after uploading --- package.json | 2 +- src/crawler.ts | 35 +++++++++++++++++++++-------------- src/util/state.ts | 10 ++++++---- yarn.lock | 18 +++++++++--------- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/package.json b/package.json index 572c960a..2fcf16b7 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,7 @@ "fetch-socks": "^1.3.0", "get-folder-size": "^4.0.0", "husky": "^8.0.3", - "ioredis": "^5.8.2", + "ioredis": "^5.3.2", "iso-639-1": "^3.1.5", "js-levenshtein": "^1.1.6", "js-yaml": "^4.1.0", diff --git a/src/crawler.ts b/src/crawler.ts index 189275fc..39070146 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -203,6 +203,7 @@ export class Crawler { | null = null; recording: boolean; + deduping = false; constructor() { const args = this.parseArgs(); @@ -344,6 +345,8 @@ export class Crawler { const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0"; const dedupRedisUrl = this.params.redisDedupUrl || redisUrl; + this.deduping = dedupRedisUrl !== redisUrl; + if (!redisUrl.startsWith("redis://")) { logger.fatal( "stateStoreUrl must start with redis:// -- Only redis-based store currently supported", @@ -1910,10 +1913,20 @@ self.__bx_behaviors.selectMainBehavior(); } if (this.params.generateWACZ && generateFiles) { - const uploaded = await this.generateWACZ(); + const wacz = await this.generateWACZ(); - if (uploaded && this.uploadAndDeleteLocal) { + if (wacz) { + if (this.deduping) { + await this.crawlState.setStatus("post-crawl"); + await this.crawlState.updateDedupSource(wacz); + } + + await this.crawlState.clearWACZFilename(); + } + + if (wacz && this.uploadAndDeleteLocal) { await this.crawlState.setArchiveSize(0); + logger.info( `Uploaded WACZ, deleting local data to free up space: ${this.collDir}`, ); @@ -1962,7 +1975,7 @@ self.__bx_behaviors.selectMainBehavior(); await streamFinish(logFH); } - async generateWACZ() { + async generateWACZ(): Promise { logger.info("Generating WACZ"); await this.crawlState.setStatus("generate-wacz"); @@ -1976,11 +1989,11 @@ self.__bx_behaviors.selectMainBehavior(); if (!warcFileList.length) { // if finished, just return if (isFinished || (await this.crawlState.isCrawlCanceled())) { - return; + return null; } // possibly restarted after committing, so assume done here! if ((await this.crawlState.numDone()) > 0) { - return; + return null; } // fail crawl otherwise logger.fatal("No WARC Files, assuming crawl failed"); @@ -2041,16 +2054,8 @@ self.__bx_behaviors.selectMainBehavior(); await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished); - await this.crawlState.updateDedupSource(wacz); - - await this.crawlState.clearWACZFilename(); - - return true; - } else { - await this.crawlState.updateDedupSource(wacz); + return wacz; } - - return false; } catch (e) { logger.error("Error creating WACZ", e); if (!streaming) { @@ -2059,6 +2064,8 @@ self.__bx_behaviors.selectMainBehavior(); await this.setStatusAndExit(ExitCodes.UploadFailed, "interrupted"); } } + + return null; } logMemory() { diff --git a/src/util/state.ts b/src/util/state.ts index d24205be..b1826013 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -262,12 +262,13 @@ export class RedisDedupIndex { async commitDedupDone() { for await (const hashes of this.dedupRedis.hscanStream( `h:${this.crawlId}`, - { - noValues: true, - }, )) { + let value = false; for (const hash of hashes) { - await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, this.crawlId); + if (!value) { + await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, this.crawlId); + } + value = !value; } } @@ -1338,6 +1339,7 @@ return inx; await this.redis.set(`${this.key}:profileUploaded`, JSON.stringify(result)); } + // DEPENDENT CRAWLS FOR DEDUPE async addDupeCrawlRef(crawlId: string, index: string) { await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index); } diff --git a/yarn.lock b/yarn.lock index 55cbb71b..94214f3f 100644 --- a/yarn.lock +++ b/yarn.lock @@ -370,10 +370,10 @@ resolved "https://registry.yarnpkg.com/@humanwhocodes/object-schema/-/object-schema-2.0.3.tgz#4a2868d75d6d6963e423bcf90b7fd1be343409d3" integrity sha512-93zYdMES/c1D69yZiKDBj0V24vqNzB/koF26KPaagAfd3P/4gUlh3Dys5ogAK+Exi9QyzlD8x/08Zt7wIKcDcA== -"@ioredis/commands@1.4.0": - version "1.4.0" - resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.4.0.tgz#9f657d51cdd5d2fdb8889592aa4a355546151f25" - integrity sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ== +"@ioredis/commands@^1.1.1": + version "1.2.0" + resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.2.0.tgz#6d61b3097470af1fdbbe622795b8921d42018e11" + integrity sha512-Sx1pU8EM64o2BrqNpEO1CNLtKQwyhuXuqyfH7oGKCk+1a33d2r5saW8zNwm3j6BTExtjrv2BxTgzzkMwts6vGg== "@istanbuljs/load-nyc-config@^1.0.0": version "1.1.0" @@ -3014,12 +3014,12 @@ intl-messageformat@^10.5.3: "@formatjs/icu-messageformat-parser" "2.11.2" tslib "^2.8.0" -ioredis@^5.8.2: - version "5.8.2" - resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.8.2.tgz#c7a228a26cf36f17a5a8011148836877780e2e14" - integrity sha512-C6uC+kleiIMmjViJINWk80sOQw5lEzse1ZmvD+S/s8p8CWapftSaC+kocGTx6xrbrJ4WmYQGC08ffHLr6ToR6Q== +ioredis@^5.3.2: + version "5.4.1" + resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.4.1.tgz#1c56b70b759f01465913887375ed809134296f40" + integrity sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA== dependencies: - "@ioredis/commands" "1.4.0" + "@ioredis/commands" "^1.1.1" cluster-key-slot "^1.1.0" debug "^4.3.4" denque "^2.1.0"