mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
Always return the WACZ; store WACZ-level dependencies only for the current WACZ.
Store crawl-ID dependencies for the entire crawl.
This commit is contained in:
parent
9fba5da0ce
commit
c4f07c4e59
2 changed files with 11 additions and 5 deletions
|
|
@ -1899,12 +1899,14 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
if (this.deduping) {
|
if (this.deduping) {
|
||||||
await this.crawlState.setStatus("post-crawl");
|
await this.crawlState.setStatus("post-crawl");
|
||||||
await this.crawlState.updateDedupSource(wacz);
|
await this.crawlState.updateDedupSource(wacz);
|
||||||
|
|
||||||
|
await this.crawlState.clearDupeFileRef();
|
||||||
}
|
}
|
||||||
|
|
||||||
await this.crawlState.clearWACZFilename();
|
await this.crawlState.clearWACZFilename();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (wacz && this.uploadAndDeleteLocal) {
|
if (wacz && this.storage && this.uploadAndDeleteLocal) {
|
||||||
await this.crawlState.setArchiveSize(0);
|
await this.crawlState.setArchiveSize(0);
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|
@ -2033,9 +2035,8 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
const targetFilename = await this.crawlState.getWACZFilename();
|
const targetFilename = await this.crawlState.getWACZFilename();
|
||||||
|
|
||||||
await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);
|
await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);
|
||||||
|
|
||||||
return wacz;
|
|
||||||
}
|
}
|
||||||
|
return wacz;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.error("Error creating WACZ", e);
|
logger.error("Error creating WACZ", e);
|
||||||
if (!streaming) {
|
if (!streaming) {
|
||||||
|
|
|
||||||
|
|
@ -1341,11 +1341,16 @@ return inx;
|
||||||
|
|
||||||
// DEPENDENT CRAWLS FOR DEDUPE
|
// DEPENDENT CRAWLS FOR DEDUPE
|
||||||
async addDupeCrawlRef(crawlId: string, index: string) {
|
async addDupeCrawlRef(crawlId: string, index: string) {
|
||||||
await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index);
|
await this.redis.sadd(`${this.uid}:dindex`, crawlId + " " + index);
|
||||||
|
await this.redis.sadd(`${this.crawlId}:depCrawls`, crawlId);
|
||||||
|
}
|
||||||
|
|
||||||
|
async clearDupeFileRef() {
|
||||||
|
await this.redis.del(`${this.uid}:dindex`);
|
||||||
}
|
}
|
||||||
|
|
||||||
async getDupeDependentSources() {
|
async getDupeDependentSources() {
|
||||||
const dependIndexes = await this.redis.smembers(`${this.crawlId}:dindex`);
|
const dependIndexes = await this.redis.smembers(`${this.uid}:dindex`);
|
||||||
const crawlIds = [];
|
const crawlIds = [];
|
||||||
for (const value of dependIndexes) {
|
for (const value of dependIndexes) {
|
||||||
const [crawlId, index] = value.split(" ");
|
const [crawlId, index] = value.split(" ");
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue