From 9a7ac9bef1fb61b8495d555432e524dec6befee6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 28 Feb 2025 17:58:56 -0800 Subject: [PATCH] Fix using cached WACZ filename if already set ahead of time. (#783) - if :nextWacz filename already exists, actually get it and use that! - don't merge cdx if not generating wacz yet, use same condition for both bump version to 1.5.8 - fix follow-up to #748, fix #747 --- package.json | 2 +- src/crawler.ts | 12 ++++++------ src/util/state.ts | 30 ++++++++++++++++++++++++------ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/package.json b/package.json index 40c4624d..9ff86ccb 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.5.7", + "version": "1.5.8", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/src/crawler.ts b/src/crawler.ts index 16a3ffcd..74fa2cab 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1730,9 +1730,13 @@ self.__bx_behaviors.selectMainBehavior(); await this.combineWARC(); } + const generateFiles = + !this.params.dryRun && + (!this.interruptReason || this.finalExit || this.uploadAndDeleteLocal); + if ( (this.params.generateCDX || this.params.generateWACZ) && - !this.params.dryRun + generateFiles ) { logger.info("Merging CDX"); await this.crawlState.setStatus( @@ -1746,11 +1750,7 @@ self.__bx_behaviors.selectMainBehavior(); ); } - if ( - this.params.generateWACZ && - !this.params.dryRun && - (!this.interruptReason || this.finalExit || this.uploadAndDeleteLocal) - ) { + if (this.params.generateWACZ && generateFiles) { const uploaded = await this.generateWACZ(); if (uploaded && this.uploadAndDeleteLocal) { diff --git a/src/util/state.ts b/src/util/state.ts index e2934b74..60912ade 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -420,12 +420,30 @@ return inx; async setWACZFilename(): Promise { const filename = process.env.STORE_FILENAME || "@ts-@id.wacz"; this.waczFilename = interpolateFilename(filename, this.key); - await this.redis.hsetnx( - `${this.key}:nextWacz`, - this.uid, - this.waczFilename, - ); - return this.waczFilename; + if ( + !(await this.redis.hsetnx( + `${this.key}:nextWacz`, + this.uid, + this.waczFilename, + )) + ) { + this.waczFilename = await this.redis.hget( + `${this.key}:nextWacz`, + this.uid, + ); + logger.debug( + "Keeping WACZ Filename", + { filename: this.waczFilename }, + "state", + ); + } else { + logger.debug( + "Using New WACZ Filename", + { filename: this.waczFilename }, + "state", + ); + } + return this.waczFilename!; } async getWACZFilename(): Promise {