profile download improvements:

- log when profile download starts
- ensure each profile download attempt has a timeout (60 secs)
- retry 2 more times if the initial profile download times out
- fail the crawl after 3 retries if the profile cannot be downloaded successfully

bump to 1.8.2
This commit is contained in:
Ilya Kreymer 2025-10-16 12:40:26 -07:00
parent 6f26148a9b
commit e58680dab4
2 changed files with 37 additions and 6 deletions

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.8.1",
"version": "1.8.2",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",

View file

@ -15,9 +15,13 @@ import { logger } from "./logger.js";
import getFolderSize from "get-folder-size";
import { WACZ } from "./wacz.js";
import { sleep, timedRun } from "./timing.js";
import { DEFAULT_MAX_RETRIES, ExitCodes } from "./constants.js";
// Default region name. Presumably the fallback region for the S3 client
// when none is configured — TODO confirm against its usage in this file.
const DEFAULT_REGION = "us-east-1";
// Time limit, in seconds, handed to timedRun() for a single profile
// download attempt in S3StorageSync.downloadFile().
const DOWNLOAD_PROFILE_MAX_TIME = 60;
// ===========================================================================
export class S3StorageSync {
fullPrefix: string;
@ -134,11 +138,38 @@ export class S3StorageSync {
}
/**
 * Download `objectPrefix + srcFilename` from the bucket into the local file
 * `destFilename`, bounding every attempt with a timeout and retrying on
 * failure.
 *
 * Each attempt is wrapped in timedRun() with DOWNLOAD_PROFILE_MAX_TIME.
 * A failed attempt is retried while the retry counter is
 * <= DEFAULT_MAX_RETRIES; once the retries are used up the failure is
 * reported through logger.fatal() with ExitCodes.Failed.
 *
 * NOTE(review): this relies on timedRun() rejecting when the time limit is
 * hit. If timedRun() only logs the timeout and resolves, a timed-out attempt
 * is treated as a successful download — confirm against timing.js.
 *
 * @param srcFilename object name, relative to this storage's object prefix
 * @param destFilename local path the object is written to
 * @throws the last download error, but only if logger.fatal() returns
 *   instead of terminating the process
 */
async downloadFile(srcFilename: string, destFilename: string) {
  // Number of retries performed so far (the first attempt is not a retry).
  let count = 0;

  logger.debug("Downloading profile", { srcFilename }, "storage");

  while (true) {
    try {
      // The download is started only here, inside the timed wrapper, so no
      // attempt can run without a time limit.
      await timedRun(
        this.client.fGetObject(
          this.bucketName,
          this.objectPrefix + srcFilename,
          destFilename,
        ),
        DOWNLOAD_PROFILE_MAX_TIME,
        "Timed out downloading profile",
        {},
        "storage",
        true,
      );
      break;
    } catch (e) {
      // Keep the failure reason so retry / fatal log entries are actionable.
      const error = e instanceof Error ? e.message : String(e);

      if (count > DEFAULT_MAX_RETRIES) {
        logger.fatal(
          "Could not download profile, exiting",
          { srcFilename, error },
          "storage",
          ExitCodes.Failed,
        );
        // logger.fatal() is expected to end the process; if it returns
        // (e.g. exit is deferred), stop looping instead of retrying forever.
        throw e;
      }

      count += 1;
      // Log at the moment of failure, then back off before the next attempt.
      logger.warn(
        "Retry downloading profile",
        { srcFilename, retry: count, error },
        "storage",
      );
      // presumably 5 seconds — confirm the unit of sleep() in timing.js
      await sleep(5);
    }
  }
}
async uploadCollWACZ(