2023-03-31 12:35:18 -04:00
|
|
|
import child_process from "child_process";
|
2022-10-24 15:30:10 +02:00
|
|
|
import fs from "fs";
|
|
|
|
import fsp from "fs/promises";
|
2023-03-31 12:35:18 -04:00
|
|
|
import util from "util";
|
2022-02-08 15:31:55 -08:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
import os from "os";
|
|
|
|
import { createHash } from "crypto";
|
2023-11-09 11:27:11 -08:00
|
|
|
|
2023-10-20 16:29:07 -07:00
|
|
|
import crc32 from "crc/crc32";
|
2021-11-23 12:53:30 -08:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
import * as Minio from "minio";
|
2021-11-23 12:53:30 -08:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
import { initRedis } from "./redis.js";
|
2023-03-17 14:24:44 -07:00
|
|
|
import { logger } from "./logger.js";
|
2021-11-23 12:53:30 -08:00
|
|
|
|
2024-05-12 12:42:04 -04:00
|
|
|
// @ts-expect-error (incorrect types on get-folder-size)
|
2022-10-24 15:30:10 +02:00
|
|
|
import getFolderSize from "get-folder-size";
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
2024-05-12 12:42:04 -04:00
|
|
|
// Fallback S3 region used when the STORE_REGION env var is not set
const DEFAULT_REGION = "us-east-1";
|
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
// ===========================================================================
|
2023-11-09 19:11:11 -05:00
|
|
|
export class S3StorageSync {
|
2023-11-09 11:27:11 -08:00
|
|
|
fullPrefix: string;
|
|
|
|
client: Minio.Client;
|
|
|
|
|
|
|
|
bucketName: string;
|
|
|
|
objectPrefix: string;
|
|
|
|
resources: object[] = [];
|
|
|
|
|
|
|
|
userId: string;
|
|
|
|
crawlId: string;
|
|
|
|
webhookUrl?: string;
|
|
|
|
|
|
|
|
// TODO: Fix this the next time the file is edited.
|
2023-11-09 19:11:11 -05:00
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
constructor(
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
urlOrData: string | any,
|
2023-11-09 19:11:11 -05:00
|
|
|
{
|
|
|
|
webhookUrl,
|
|
|
|
userId,
|
|
|
|
crawlId,
|
|
|
|
}: { webhookUrl?: string; userId: string; crawlId: string },
|
2023-11-09 11:27:11 -08:00
|
|
|
) {
|
2021-11-23 12:53:30 -08:00
|
|
|
let url;
|
|
|
|
let accessKey;
|
|
|
|
let secretKey;
|
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
if (typeof urlOrData === "string") {
|
2021-11-23 12:53:30 -08:00
|
|
|
url = new URL(urlOrData);
|
|
|
|
accessKey = url.username;
|
|
|
|
secretKey = url.password;
|
|
|
|
url.username = "";
|
|
|
|
url.password = "";
|
|
|
|
this.fullPrefix = url.href;
|
|
|
|
} else {
|
|
|
|
url = new URL(urlOrData.endpointUrl);
|
|
|
|
accessKey = urlOrData.accessKey;
|
|
|
|
secretKey = urlOrData.secretKey;
|
|
|
|
this.fullPrefix = url.href;
|
|
|
|
}
|
|
|
|
|
2024-05-12 12:42:04 -04:00
|
|
|
const region = process.env.STORE_REGION || DEFAULT_REGION;
|
|
|
|
|
2021-11-23 12:53:30 -08:00
|
|
|
this.client = new Minio.Client({
|
|
|
|
endPoint: url.hostname,
|
|
|
|
port: Number(url.port) || (url.protocol === "https:" ? 443 : 80),
|
|
|
|
useSSL: url.protocol === "https:",
|
|
|
|
accessKey,
|
2022-02-08 15:31:55 -08:00
|
|
|
secretKey,
|
2023-11-09 19:11:11 -05:00
|
|
|
partSize: 100 * 1024 * 1024,
|
2024-05-12 12:42:04 -04:00
|
|
|
region,
|
2021-11-23 12:53:30 -08:00
|
|
|
});
|
|
|
|
|
|
|
|
this.bucketName = url.pathname.slice(1).split("/")[0];
|
|
|
|
|
|
|
|
this.objectPrefix = url.pathname.slice(this.bucketName.length + 2);
|
|
|
|
|
|
|
|
this.resources = [];
|
|
|
|
|
|
|
|
this.userId = userId;
|
|
|
|
this.crawlId = crawlId;
|
|
|
|
this.webhookUrl = webhookUrl;
|
|
|
|
}
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async uploadFile(srcFilename: string, targetFilename: string) {
|
2022-12-15 12:38:41 -05:00
|
|
|
const fileUploadInfo = {
|
2023-11-09 19:11:11 -05:00
|
|
|
bucket: this.bucketName,
|
|
|
|
crawlId: this.crawlId,
|
|
|
|
prefix: this.objectPrefix,
|
|
|
|
targetFilename,
|
2022-12-15 12:38:41 -05:00
|
|
|
};
|
2023-11-14 21:54:40 -08:00
|
|
|
logger.info("S3 file upload information", fileUploadInfo, "storage");
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
await this.client.fPutObject(
|
|
|
|
this.bucketName,
|
|
|
|
this.objectPrefix + targetFilename,
|
|
|
|
srcFilename,
|
|
|
|
);
|
2022-05-05 14:27:17 -05:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
const { hash, crc32 } = await checksumFile("sha256", srcFilename);
|
2023-10-23 18:35:03 -07:00
|
|
|
const path = targetFilename;
|
2021-11-23 12:53:30 -08:00
|
|
|
|
2022-05-05 14:27:17 -05:00
|
|
|
const size = await getFileSize(srcFilename);
|
2023-10-23 18:35:03 -07:00
|
|
|
|
|
|
|
// for backwards compatibility, keep 'bytes'
|
2023-11-09 19:11:11 -05:00
|
|
|
return { path, size, hash, crc32, bytes: size };
|
2022-05-05 14:27:17 -05:00
|
|
|
}
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
async downloadFile(srcFilename: string, destFilename: string) {
|
2023-11-09 19:11:11 -05:00
|
|
|
await this.client.fGetObject(
|
|
|
|
this.bucketName,
|
|
|
|
this.objectPrefix + srcFilename,
|
|
|
|
destFilename,
|
|
|
|
);
|
2022-05-05 14:27:17 -05:00
|
|
|
}
|
2021-11-23 12:53:30 -08:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
async uploadCollWACZ(
|
|
|
|
srcFilename: string,
|
|
|
|
targetFilename: string,
|
|
|
|
completed = true,
|
|
|
|
) {
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
const resource = await this.uploadFile(srcFilename, targetFilename);
|
2023-11-09 19:11:11 -05:00
|
|
|
logger.info(
|
|
|
|
"WACZ S3 file upload resource",
|
|
|
|
{ targetFilename, resource },
|
2023-11-14 21:54:40 -08:00
|
|
|
"storage",
|
2023-11-09 19:11:11 -05:00
|
|
|
);
|
2021-11-23 12:53:30 -08:00
|
|
|
|
|
|
|
if (this.webhookUrl) {
|
|
|
|
const body = {
|
|
|
|
id: this.crawlId,
|
|
|
|
user: this.userId,
|
|
|
|
|
|
|
|
//filename: `s3://${this.bucketName}/${this.objectPrefix}${this.waczFilename}`,
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
filename: this.fullPrefix + targetFilename,
|
2021-11-23 12:53:30 -08:00
|
|
|
|
2023-10-20 16:29:07 -07:00
|
|
|
...resource,
|
2023-11-09 19:11:11 -05:00
|
|
|
completed,
|
2021-11-23 12:53:30 -08:00
|
|
|
};
|
|
|
|
|
2022-12-15 12:38:41 -05:00
|
|
|
logger.info(`Pinging Webhook: ${this.webhookUrl}`);
|
2021-11-23 12:53:30 -08:00
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
if (
|
|
|
|
this.webhookUrl.startsWith("http://") ||
|
|
|
|
this.webhookUrl.startsWith("https://")
|
|
|
|
) {
|
|
|
|
await fetch(this.webhookUrl, {
|
|
|
|
method: "POST",
|
|
|
|
body: JSON.stringify(body),
|
|
|
|
});
|
2021-11-23 12:53:30 -08:00
|
|
|
} else if (this.webhookUrl.startsWith("redis://")) {
|
|
|
|
const parts = this.webhookUrl.split("/");
|
|
|
|
if (parts.length !== 5) {
|
2023-11-09 19:11:11 -05:00
|
|
|
logger.fatal(
|
|
|
|
"redis webhook url must be in format: redis://<host>:<port>/<db>/<key>",
|
2023-11-14 21:54:40 -08:00
|
|
|
{},
|
|
|
|
"redis",
|
2023-11-09 19:11:11 -05:00
|
|
|
);
|
2021-11-23 12:53:30 -08:00
|
|
|
}
|
|
|
|
const redis = await initRedis(parts.slice(0, 4).join("/"));
|
|
|
|
await redis.rpush(parts[4], JSON.stringify(body));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
export function initStorage() {
|
2022-05-05 14:27:17 -05:00
|
|
|
if (!process.env.STORE_ENDPOINT_URL) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
const endpointUrl =
|
|
|
|
process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
|
2022-05-05 14:27:17 -05:00
|
|
|
const storeInfo = {
|
|
|
|
endpointUrl,
|
|
|
|
accessKey: process.env.STORE_ACCESS_KEY,
|
|
|
|
secretKey: process.env.STORE_SECRET_KEY,
|
|
|
|
};
|
|
|
|
|
|
|
|
const opts = {
|
|
|
|
crawlId: process.env.CRAWL_ID || os.hostname(),
|
2023-11-09 11:27:11 -08:00
|
|
|
webhookUrl: process.env.WEBHOOK_URL || "",
|
|
|
|
userId: process.env.STORE_USER || "",
|
2022-05-05 14:27:17 -05:00
|
|
|
};
|
|
|
|
|
2023-01-23 10:43:12 -08:00
|
|
|
logger.info("Initing Storage...");
|
2022-05-05 14:27:17 -05:00
|
|
|
return new S3StorageSync(storeInfo, opts);
|
|
|
|
}
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
export async function getFileSize(filename: string) {
|
2022-02-08 15:31:55 -08:00
|
|
|
const stats = await fsp.stat(filename);
|
|
|
|
return stats.size;
|
|
|
|
}
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
export async function getDirSize(dir: string) {
|
2023-03-09 09:00:14 -08:00
|
|
|
const { size, errors } = await getFolderSize(dir);
|
|
|
|
if (errors && errors.length) {
|
2023-11-14 21:54:40 -08:00
|
|
|
logger.warn("Size check errors", { errors }, "storage");
|
2023-03-09 09:00:14 -08:00
|
|
|
}
|
|
|
|
return size;
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
}
|
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
export async function checkDiskUtilization(
|
2024-06-07 19:13:15 +02:00
|
|
|
collDir: string,
|
2023-11-09 19:11:11 -05:00
|
|
|
// TODO: Fix this the next time the file is edited.
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
params: Record<string, any>,
|
|
|
|
archiveDirSize: number,
|
|
|
|
dfOutput = null,
|
2024-06-26 13:05:13 -07:00
|
|
|
doLog = true,
|
2023-11-09 19:11:11 -05:00
|
|
|
) {
|
|
|
|
const diskUsage: Record<string, string> = await getDiskUsage(
|
2024-06-07 19:13:15 +02:00
|
|
|
collDir,
|
2023-11-09 19:11:11 -05:00
|
|
|
dfOutput,
|
|
|
|
);
|
2023-07-06 00:58:28 -04:00
|
|
|
const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));
|
|
|
|
|
|
|
|
// Check that disk usage isn't already above threshold
|
|
|
|
if (usedPercentage >= params.diskUtilization) {
|
2024-06-26 13:05:13 -07:00
|
|
|
if (doLog) {
|
|
|
|
logger.info(
|
|
|
|
`Disk utilization threshold reached ${usedPercentage}% > ${params.diskUtilization}%, stopping`,
|
|
|
|
);
|
|
|
|
}
|
2023-07-06 00:58:28 -04:00
|
|
|
return {
|
|
|
|
stop: true,
|
|
|
|
used: usedPercentage,
|
|
|
|
projected: null,
|
2023-11-09 19:11:11 -05:00
|
|
|
threshold: params.diskUtilization,
|
2023-07-06 00:58:28 -04:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check that disk usage isn't likely to cross threshold
|
|
|
|
const kbUsed = parseInt(diskUsage["Used"]);
|
|
|
|
const kbTotal = parseInt(diskUsage["1K-blocks"]);
|
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
let kbArchiveDirSize = Math.round(archiveDirSize / 1024);
|
2023-07-06 00:58:28 -04:00
|
|
|
if (params.combineWARC && params.generateWACZ) {
|
|
|
|
kbArchiveDirSize *= 4;
|
|
|
|
} else if (params.combineWARC || params.generateWACZ) {
|
|
|
|
kbArchiveDirSize *= 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
const projectedTotal = kbUsed + kbArchiveDirSize;
|
2023-11-09 19:11:11 -05:00
|
|
|
const projectedUsedPercentage = calculatePercentageUsed(
|
|
|
|
projectedTotal,
|
|
|
|
kbTotal,
|
|
|
|
);
|
2023-07-06 00:58:28 -04:00
|
|
|
|
|
|
|
if (projectedUsedPercentage >= params.diskUtilization) {
|
2024-06-26 13:05:13 -07:00
|
|
|
if (doLog) {
|
|
|
|
logger.info(
|
|
|
|
`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${params.diskUtilization}%, stopping`,
|
|
|
|
);
|
|
|
|
}
|
2023-07-06 00:58:28 -04:00
|
|
|
return {
|
|
|
|
stop: true,
|
|
|
|
used: usedPercentage,
|
|
|
|
projected: projectedUsedPercentage,
|
2023-11-09 19:11:11 -05:00
|
|
|
threshold: params.diskUtilization,
|
2023-07-06 00:58:28 -04:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
return {
|
|
|
|
stop: false,
|
|
|
|
used: usedPercentage,
|
|
|
|
projected: projectedUsedPercentage,
|
2023-11-09 19:11:11 -05:00
|
|
|
threshold: params.diskUtilization,
|
2023-07-06 00:58:28 -04:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
export async function getDFOutput(path: string) {
|
2023-03-31 12:35:18 -04:00
|
|
|
const exec = util.promisify(child_process.exec);
|
2023-07-06 00:58:28 -04:00
|
|
|
const res = await exec(`df ${path}`);
|
|
|
|
return res.stdout;
|
|
|
|
}
|
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
export async function getDiskUsage(path = "/crawls", dfOutput = null) {
|
2023-11-09 11:27:11 -08:00
|
|
|
const result = dfOutput || (await getDFOutput(path));
|
2023-07-06 00:58:28 -04:00
|
|
|
const lines = result.split("\n");
|
2023-11-09 19:11:11 -05:00
|
|
|
const keys = lines[0].split(/\s+/gi);
|
|
|
|
const rows = lines.slice(1).map((line) => {
|
|
|
|
const values = line.split(/\s+/gi);
|
2023-11-09 11:27:11 -08:00
|
|
|
// TODO: Fix this the next time the file is edited.
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
|
|
return keys.reduce((o: Record<string, any>, k, index) => {
|
2023-03-31 12:35:18 -04:00
|
|
|
o[k] = values[index];
|
|
|
|
return o;
|
|
|
|
}, {});
|
|
|
|
});
|
|
|
|
return rows[0];
|
|
|
|
}
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
export function calculatePercentageUsed(used: number, total: number) {
|
2023-11-09 19:11:11 -05:00
|
|
|
return Math.round((used / total) * 100);
|
2023-07-06 00:58:28 -04:00
|
|
|
}
|
|
|
|
|
2023-11-09 19:11:11 -05:00
|
|
|
function checksumFile(
|
|
|
|
hashName: string,
|
|
|
|
path: string,
|
|
|
|
): Promise<{ hash: string; crc32: number }> {
|
2022-02-08 15:31:55 -08:00
|
|
|
return new Promise((resolve, reject) => {
|
|
|
|
const hash = createHash(hashName);
|
2023-11-09 19:11:11 -05:00
|
|
|
let crc: number = 0;
|
2023-10-20 16:29:07 -07:00
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
const stream = fs.createReadStream(path);
|
2023-11-09 19:11:11 -05:00
|
|
|
stream.on("error", (err) => reject(err));
|
2023-10-20 16:29:07 -07:00
|
|
|
stream.on("data", (chunk) => {
|
|
|
|
hash.update(chunk);
|
|
|
|
crc = crc32(chunk, crc);
|
|
|
|
});
|
2023-11-09 19:11:11 -05:00
|
|
|
stream.on("end", () => resolve({ hash: hash.digest("hex"), crc32: crc }));
|
2022-02-08 15:31:55 -08:00
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2023-11-09 11:27:11 -08:00
|
|
|
export function interpolateFilename(filename: string, crawlId: string) {
|
2023-11-09 19:11:11 -05:00
|
|
|
filename = filename.replace(
|
|
|
|
"@ts",
|
|
|
|
new Date().toISOString().replace(/[:TZz.-]/g, ""),
|
|
|
|
);
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
filename = filename.replace("@hostname", os.hostname());
|
|
|
|
filename = filename.replace("@hostsuffix", os.hostname().slice(-14));
|
|
|
|
filename = filename.replace("@id", crawlId);
|
|
|
|
return filename;
|
|
|
|
}
|