cleanup, keep compatibility with redis 6 still

set to 'post-crawl' state after uploading
This commit is contained in:
Ilya Kreymer 2025-10-24 13:24:53 -07:00
parent 61acafd234
commit 105bd06f30
4 changed files with 37 additions and 28 deletions

View file

@ -25,7 +25,7 @@
"fetch-socks": "^1.3.0",
"get-folder-size": "^4.0.0",
"husky": "^8.0.3",
"ioredis": "^5.8.2",
"ioredis": "^5.3.2",
"iso-639-1": "^3.1.5",
"js-levenshtein": "^1.1.6",
"js-yaml": "^4.1.0",

View file

@ -203,6 +203,7 @@ export class Crawler {
| null = null;
recording: boolean;
deduping = false;
constructor() {
const args = this.parseArgs();
@ -344,6 +345,8 @@ export class Crawler {
const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
const dedupRedisUrl = this.params.redisDedupUrl || redisUrl;
this.deduping = dedupRedisUrl !== redisUrl;
if (!redisUrl.startsWith("redis://")) {
logger.fatal(
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
@ -1910,10 +1913,20 @@ self.__bx_behaviors.selectMainBehavior();
}
if (this.params.generateWACZ && generateFiles) {
const uploaded = await this.generateWACZ();
const wacz = await this.generateWACZ();
if (uploaded && this.uploadAndDeleteLocal) {
if (wacz) {
if (this.deduping) {
await this.crawlState.setStatus("post-crawl");
await this.crawlState.updateDedupSource(wacz);
}
await this.crawlState.clearWACZFilename();
}
if (wacz && this.uploadAndDeleteLocal) {
await this.crawlState.setArchiveSize(0);
logger.info(
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
);
@ -1962,7 +1975,7 @@ self.__bx_behaviors.selectMainBehavior();
await streamFinish(logFH);
}
async generateWACZ() {
async generateWACZ(): Promise<WACZ | null> {
logger.info("Generating WACZ");
await this.crawlState.setStatus("generate-wacz");
@ -1976,11 +1989,11 @@ self.__bx_behaviors.selectMainBehavior();
if (!warcFileList.length) {
// if finished, just return
if (isFinished || (await this.crawlState.isCrawlCanceled())) {
return;
return null;
}
// possibly restarted after committing, so assume done here!
if ((await this.crawlState.numDone()) > 0) {
return;
return null;
}
// fail crawl otherwise
logger.fatal("No WARC Files, assuming crawl failed");
@ -2041,16 +2054,8 @@ self.__bx_behaviors.selectMainBehavior();
await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);
await this.crawlState.updateDedupSource(wacz);
await this.crawlState.clearWACZFilename();
return true;
} else {
await this.crawlState.updateDedupSource(wacz);
return wacz;
}
return false;
} catch (e) {
logger.error("Error creating WACZ", e);
if (!streaming) {
@ -2059,6 +2064,8 @@ self.__bx_behaviors.selectMainBehavior();
await this.setStatusAndExit(ExitCodes.UploadFailed, "interrupted");
}
}
return null;
}
logMemory() {

View file

@ -262,13 +262,14 @@ export class RedisDedupIndex {
async commitDedupDone() {
for await (const hashes of this.dedupRedis.hscanStream(
`h:${this.crawlId}`,
{
noValues: true,
},
)) {
let value = false;
for (const hash of hashes) {
if (!value) {
await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, this.crawlId);
}
value = !value;
}
}
// add to crawls list
@ -1338,6 +1339,7 @@ return inx;
await this.redis.set(`${this.key}:profileUploaded`, JSON.stringify(result));
}
// DEPENDENT CRAWLS FOR DEDUPE
async addDupeCrawlRef(crawlId: string, index: string) {
await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index);
}

View file

@ -370,10 +370,10 @@
resolved "https://registry.yarnpkg.com/@humanwhocodes/object-schema/-/object-schema-2.0.3.tgz#4a2868d75d6d6963e423bcf90b7fd1be343409d3"
integrity sha512-93zYdMES/c1D69yZiKDBj0V24vqNzB/koF26KPaagAfd3P/4gUlh3Dys5ogAK+Exi9QyzlD8x/08Zt7wIKcDcA==
"@ioredis/commands@1.4.0":
version "1.4.0"
resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.4.0.tgz#9f657d51cdd5d2fdb8889592aa4a355546151f25"
integrity sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ==
"@ioredis/commands@^1.1.1":
version "1.2.0"
resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.2.0.tgz#6d61b3097470af1fdbbe622795b8921d42018e11"
integrity sha512-Sx1pU8EM64o2BrqNpEO1CNLtKQwyhuXuqyfH7oGKCk+1a33d2r5saW8zNwm3j6BTExtjrv2BxTgzzkMwts6vGg==
"@istanbuljs/load-nyc-config@^1.0.0":
version "1.1.0"
@ -3014,12 +3014,12 @@ intl-messageformat@^10.5.3:
"@formatjs/icu-messageformat-parser" "2.11.2"
tslib "^2.8.0"
ioredis@^5.8.2:
version "5.8.2"
resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.8.2.tgz#c7a228a26cf36f17a5a8011148836877780e2e14"
integrity sha512-C6uC+kleiIMmjViJINWk80sOQw5lEzse1ZmvD+S/s8p8CWapftSaC+kocGTx6xrbrJ4WmYQGC08ffHLr6ToR6Q==
ioredis@^5.3.2:
version "5.4.1"
resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.4.1.tgz#1c56b70b759f01465913887375ed809134296f40"
integrity sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==
dependencies:
"@ioredis/commands" "1.4.0"
"@ioredis/commands" "^1.1.1"
cluster-key-slot "^1.1.0"
debug "^4.3.4"
denque "^2.1.0"