Cleanup: keep compatibility with Redis 6 for now

Set status to 'post-crawl' after uploading
This commit is contained in:
Ilya Kreymer 2025-10-24 13:24:53 -07:00
parent 61acafd234
commit 105bd06f30
4 changed files with 37 additions and 28 deletions

View file

@ -25,7 +25,7 @@
"fetch-socks": "^1.3.0", "fetch-socks": "^1.3.0",
"get-folder-size": "^4.0.0", "get-folder-size": "^4.0.0",
"husky": "^8.0.3", "husky": "^8.0.3",
"ioredis": "^5.8.2", "ioredis": "^5.3.2",
"iso-639-1": "^3.1.5", "iso-639-1": "^3.1.5",
"js-levenshtein": "^1.1.6", "js-levenshtein": "^1.1.6",
"js-yaml": "^4.1.0", "js-yaml": "^4.1.0",

View file

@ -203,6 +203,7 @@ export class Crawler {
| null = null; | null = null;
recording: boolean; recording: boolean;
deduping = false;
constructor() { constructor() {
const args = this.parseArgs(); const args = this.parseArgs();
@ -344,6 +345,8 @@ export class Crawler {
const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0"; const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
const dedupRedisUrl = this.params.redisDedupUrl || redisUrl; const dedupRedisUrl = this.params.redisDedupUrl || redisUrl;
this.deduping = dedupRedisUrl !== redisUrl;
if (!redisUrl.startsWith("redis://")) { if (!redisUrl.startsWith("redis://")) {
logger.fatal( logger.fatal(
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported", "stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
@ -1910,10 +1913,20 @@ self.__bx_behaviors.selectMainBehavior();
} }
if (this.params.generateWACZ && generateFiles) { if (this.params.generateWACZ && generateFiles) {
const uploaded = await this.generateWACZ(); const wacz = await this.generateWACZ();
if (uploaded && this.uploadAndDeleteLocal) { if (wacz) {
if (this.deduping) {
await this.crawlState.setStatus("post-crawl");
await this.crawlState.updateDedupSource(wacz);
}
await this.crawlState.clearWACZFilename();
}
if (wacz && this.uploadAndDeleteLocal) {
await this.crawlState.setArchiveSize(0); await this.crawlState.setArchiveSize(0);
logger.info( logger.info(
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`, `Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
); );
@ -1962,7 +1975,7 @@ self.__bx_behaviors.selectMainBehavior();
await streamFinish(logFH); await streamFinish(logFH);
} }
async generateWACZ() { async generateWACZ(): Promise<WACZ | null> {
logger.info("Generating WACZ"); logger.info("Generating WACZ");
await this.crawlState.setStatus("generate-wacz"); await this.crawlState.setStatus("generate-wacz");
@ -1976,11 +1989,11 @@ self.__bx_behaviors.selectMainBehavior();
if (!warcFileList.length) { if (!warcFileList.length) {
// if finished, just return // if finished, just return
if (isFinished || (await this.crawlState.isCrawlCanceled())) { if (isFinished || (await this.crawlState.isCrawlCanceled())) {
return; return null;
} }
// possibly restarted after committing, so assume done here! // possibly restarted after committing, so assume done here!
if ((await this.crawlState.numDone()) > 0) { if ((await this.crawlState.numDone()) > 0) {
return; return null;
} }
// fail crawl otherwise // fail crawl otherwise
logger.fatal("No WARC Files, assuming crawl failed"); logger.fatal("No WARC Files, assuming crawl failed");
@ -2041,16 +2054,8 @@ self.__bx_behaviors.selectMainBehavior();
await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished); await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);
await this.crawlState.updateDedupSource(wacz); return wacz;
await this.crawlState.clearWACZFilename();
return true;
} else {
await this.crawlState.updateDedupSource(wacz);
} }
return false;
} catch (e) { } catch (e) {
logger.error("Error creating WACZ", e); logger.error("Error creating WACZ", e);
if (!streaming) { if (!streaming) {
@ -2059,6 +2064,8 @@ self.__bx_behaviors.selectMainBehavior();
await this.setStatusAndExit(ExitCodes.UploadFailed, "interrupted"); await this.setStatusAndExit(ExitCodes.UploadFailed, "interrupted");
} }
} }
return null;
} }
logMemory() { logMemory() {

View file

@ -262,12 +262,13 @@ export class RedisDedupIndex {
async commitDedupDone() { async commitDedupDone() {
for await (const hashes of this.dedupRedis.hscanStream( for await (const hashes of this.dedupRedis.hscanStream(
`h:${this.crawlId}`, `h:${this.crawlId}`,
{
noValues: true,
},
)) { )) {
let value = false;
for (const hash of hashes) { for (const hash of hashes) {
await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, this.crawlId); if (!value) {
await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, this.crawlId);
}
value = !value;
} }
} }
@ -1338,6 +1339,7 @@ return inx;
await this.redis.set(`${this.key}:profileUploaded`, JSON.stringify(result)); await this.redis.set(`${this.key}:profileUploaded`, JSON.stringify(result));
} }
// DEPENDENT CRAWLS FOR DEDUPE
async addDupeCrawlRef(crawlId: string, index: string) { async addDupeCrawlRef(crawlId: string, index: string) {
await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index); await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index);
} }

View file

@ -370,10 +370,10 @@
resolved "https://registry.yarnpkg.com/@humanwhocodes/object-schema/-/object-schema-2.0.3.tgz#4a2868d75d6d6963e423bcf90b7fd1be343409d3" resolved "https://registry.yarnpkg.com/@humanwhocodes/object-schema/-/object-schema-2.0.3.tgz#4a2868d75d6d6963e423bcf90b7fd1be343409d3"
integrity sha512-93zYdMES/c1D69yZiKDBj0V24vqNzB/koF26KPaagAfd3P/4gUlh3Dys5ogAK+Exi9QyzlD8x/08Zt7wIKcDcA== integrity sha512-93zYdMES/c1D69yZiKDBj0V24vqNzB/koF26KPaagAfd3P/4gUlh3Dys5ogAK+Exi9QyzlD8x/08Zt7wIKcDcA==
"@ioredis/commands@1.4.0": "@ioredis/commands@^1.1.1":
version "1.4.0" version "1.2.0"
resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.4.0.tgz#9f657d51cdd5d2fdb8889592aa4a355546151f25" resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.2.0.tgz#6d61b3097470af1fdbbe622795b8921d42018e11"
integrity sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ== integrity sha512-Sx1pU8EM64o2BrqNpEO1CNLtKQwyhuXuqyfH7oGKCk+1a33d2r5saW8zNwm3j6BTExtjrv2BxTgzzkMwts6vGg==
"@istanbuljs/load-nyc-config@^1.0.0": "@istanbuljs/load-nyc-config@^1.0.0":
version "1.1.0" version "1.1.0"
@ -3014,12 +3014,12 @@ intl-messageformat@^10.5.3:
"@formatjs/icu-messageformat-parser" "2.11.2" "@formatjs/icu-messageformat-parser" "2.11.2"
tslib "^2.8.0" tslib "^2.8.0"
ioredis@^5.8.2: ioredis@^5.3.2:
version "5.8.2" version "5.4.1"
resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.8.2.tgz#c7a228a26cf36f17a5a8011148836877780e2e14" resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.4.1.tgz#1c56b70b759f01465913887375ed809134296f40"
integrity sha512-C6uC+kleiIMmjViJINWk80sOQw5lEzse1ZmvD+S/s8p8CWapftSaC+kocGTx6xrbrJ4WmYQGC08ffHLr6ToR6Q== integrity sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==
dependencies: dependencies:
"@ioredis/commands" "1.4.0" "@ioredis/commands" "^1.1.1"
cluster-key-slot "^1.1.0" cluster-key-slot "^1.1.0"
debug "^4.3.4" debug "^4.3.4"
denque "^2.1.0" denque "^2.1.0"