2021-11-23 12:53:30 -08:00
|
|
|
const fs = require("fs");
|
2022-02-08 15:31:55 -08:00
|
|
|
const fsp = require("fs/promises");
|
|
|
|
|
2021-11-23 12:53:30 -08:00
|
|
|
const os = require("os");
|
|
|
|
const { createHash } = require("crypto");
|
|
|
|
|
|
|
|
const fetch = require("node-fetch");
|
|
|
|
const Minio = require("minio");
|
|
|
|
|
|
|
|
const { initRedis } = require("./redis");
|
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
const util = require("util");
|
|
|
|
const getFolderSize = util.promisify(require("get-folder-size"));
|
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
|
|
|
|
// ===========================================================================
|
2021-11-23 12:53:30 -08:00
|
|
|
/**
 * Moves crawl files to and from a bucket through a Minio client and, when a
 * webhook is configured, announces each uploaded WACZ to it.
 */
class S3StorageSync {
  /**
   * @param {string|Object} urlOrData - Either a URL string (credentials are
   *   taken from its username/password parts), or an object carrying
   *   `endpointUrl`, `accessKey` and `secretKey`. In both forms the first path
   *   segment becomes the bucket name and the remainder the object prefix.
   * @param {Object} [opts]
   * @param {string} [opts.webhookUrl] - `http://`, `https://` or `redis://`
   *   target notified by uploadCollWACZ().
   * @param {string} [opts.userId] - Sent as `user` in the webhook payload.
   * @param {string} [opts.crawlId] - Sent as `id` in the webhook payload.
   */
  constructor(urlOrData, {webhookUrl, userId, crawlId} = {}) {
    const isUrlString = typeof urlOrData === "string";
    const url = new URL(isUrlString ? urlOrData : urlOrData.endpointUrl);

    let accessKey;
    let secretKey;

    if (isUrlString) {
      // Credentials travel inside the URL: copy them out, then blank them so
      // they are not part of the href stored in fullPrefix.
      accessKey = url.username;
      secretKey = url.password;
      url.username = "";
      url.password = "";
    } else {
      ({accessKey, secretKey} = urlOrData);
    }

    this.fullPrefix = url.href;

    const useSSL = url.protocol === "https:";

    this.client = new Minio.Client({
      endPoint: url.hostname,
      // Fall back to the scheme's default port when the URL has none.
      port: Number(url.port) || (useSSL ? 443 : 80),
      useSSL,
      accessKey,
      secretKey,
      // 100 MiB
      partSize: 100 * 1024 * 1024
    });

    this.client.enableSHA256 = true;

    const [bucketName] = url.pathname.slice(1).split("/");
    this.bucketName = bucketName;

    // Skip the leading "/", the bucket name and the "/" that follows it.
    this.objectPrefix = url.pathname.slice(bucketName.length + 2);

    this.resources = [];

    this.userId = userId;
    this.crawlId = crawlId;
    this.webhookUrl = webhookUrl;
  }

  /**
   * Upload a local file under the object prefix, then hash and measure it.
   *
   * @param {string} srcFilename - Local path of the file to upload.
   * @param {string} targetFilename - Object name, relative to the prefix.
   * @returns {Promise<{path: string, hash: string, bytes: number}>} the target
   *   name, the sha256 hex digest of the local file and its size in bytes.
   */
  async uploadFile(srcFilename, targetFilename) {
    const {bucketName, objectPrefix} = this;

    console.log(`Bucket: ${bucketName}`);
    console.log(`Crawl Id: ${this.crawlId}`);
    console.log(`Prefix: ${objectPrefix}`);
    console.log(`Target Filename: ${targetFilename}`);

    const objectName = objectPrefix + targetFilename;
    await this.client.fPutObject(bucketName, objectName, srcFilename);

    const hash = await checksumFile("sha256", srcFilename);
    const bytes = await getFileSize(srcFilename);

    return {path: targetFilename, hash, bytes};
  }

  /**
   * Download an object stored under the object prefix to a local file.
   *
   * @param {string} srcFilename - Object name, relative to the prefix.
   * @param {string} destFilename - Local path to write to.
   */
  async downloadFile(srcFilename, destFilename) {
    const objectName = this.objectPrefix + srcFilename;
    await this.client.fGetObject(this.bucketName, objectName, destFilename);
  }

  /**
   * Upload a WACZ file and, if a webhook URL is set, send it a JSON payload
   * describing the upload. http(s) webhooks receive a POST; redis webhooks
   * get the payload pushed (rpush) onto the key named in the URL. Any other
   * scheme is ignored.
   *
   * @param {string} srcFilename - Local path of the WACZ file.
   * @param {string} targetFilename - Object name, relative to the prefix.
   * @param {boolean} [completed=true] - Passed through in the payload.
   * @throws {Error} if a redis webhook URL is not of the form
   *   redis://<host>:<port>/<db>/<key>.
   */
  async uploadCollWACZ(srcFilename, targetFilename, completed = true) {
    const resource = await this.uploadFile(srcFilename, targetFilename);
    console.log(resource);

    const {webhookUrl} = this;

    if (!webhookUrl) {
      return;
    }

    const body = {
      id: this.crawlId,
      user: this.userId,
      filename: this.fullPrefix + targetFilename,
      hash: resource.hash,
      size: resource.bytes,
      completed
    };

    console.log(`Pinging Webhook: ${webhookUrl}`);

    const isHttp = webhookUrl.startsWith("http://") || webhookUrl.startsWith("https://");

    if (isHttp) {
      await fetch(webhookUrl, {method: "POST", body: JSON.stringify(body)});
      return;
    }

    if (!webhookUrl.startsWith("redis://")) {
      return;
    }

    // "redis://host:port/db/key" splits into exactly five "/"-separated parts.
    const parts = webhookUrl.split("/");
    if (parts.length !== 5) {
      throw new Error("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>");
    }

    const connectionUrl = parts.slice(0, 4).join("/");
    const listKey = parts[4];

    const redis = await initRedis(connectionUrl);
    await redis.rpush(listKey, JSON.stringify(body));
  }
}
|
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
/**
 * Build an S3StorageSync from environment variables.
 *
 * Reads STORE_ENDPOINT_URL (required), STORE_PATH (appended to the endpoint
 * URL when set), STORE_ACCESS_KEY, STORE_SECRET_KEY, CRAWL_ID (defaults to
 * the OS hostname), WEBHOOK_URL and STORE_USER.
 *
 * @returns {S3StorageSync|null} null when STORE_ENDPOINT_URL is unset or empty.
 */
function initStorage() {
  const {
    STORE_ENDPOINT_URL,
    STORE_PATH,
    STORE_ACCESS_KEY,
    STORE_SECRET_KEY,
    CRAWL_ID,
    WEBHOOK_URL,
    STORE_USER,
  } = process.env;

  if (!STORE_ENDPOINT_URL) {
    return null;
  }

  const storeInfo = {
    endpointUrl: STORE_ENDPOINT_URL + (STORE_PATH || ""),
    accessKey: STORE_ACCESS_KEY,
    secretKey: STORE_SECRET_KEY,
  };

  const opts = {
    crawlId: CRAWL_ID || os.hostname(),
    webhookUrl: WEBHOOK_URL,
    userId: STORE_USER,
  };

  console.log("Initing Storage...");
  return new S3StorageSync(storeInfo, opts);
}
|
|
|
|
|
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
/**
 * Size of a file as reported by fs.promises.stat().
 *
 * @param {string} filename - Path of the file to inspect.
 * @returns {Promise<number>} the `size` field of the stat result.
 */
async function getFileSize(filename) {
  const { size } = await fsp.stat(filename);
  return size;
}
|
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
/**
 * Size of a directory, delegated to the promisified `get-folder-size` helper.
 *
 * @param {string} dir - Directory path handed to getFolderSize().
 * @returns {Promise<*>} whatever getFolderSize() resolves with.
 */
async function getDirSize(dir) {
  const folderSize = await getFolderSize(dir);
  return folderSize;
}
|
|
|
|
|
2022-02-08 15:31:55 -08:00
|
|
|
/**
 * Hash the contents of a file by streaming it through a crypto hash.
 *
 * @param {string} hashName - Algorithm name passed to crypto.createHash().
 * @param {string} path - File to read.
 * @returns {Promise<string>} hex digest of the file contents; rejects if the
 *   hash cannot be created or the file cannot be read.
 */
async function checksumFile(hashName, path) {
  const digest = createHash(hashName);

  // Async iteration surfaces stream errors as a rejection of this function.
  for await (const chunk of fs.createReadStream(path)) {
    digest.update(chunk);
  }

  return digest.digest("hex");
}
|
|
|
|
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
/**
 * Expand placeholder tokens in a filename.
 *
 * Supported tokens:
 *   @ts         - current time as an ISO string with ":", "T", "Z", "." and "-" removed
 *   @hostname   - os.hostname()
 *   @hostsuffix - last 14 characters of os.hostname()
 *   @id         - the crawl id
 *
 * Every occurrence of every token is replaced in one pass. Substituted values
 * are inserted literally: "$" sequences in them are not treated as
 * replacement patterns, and inserted text is not scanned again for tokens.
 *
 * @param {string} filename - Template containing zero or more tokens.
 * @param {string} crawlId - Value substituted for "@id".
 * @returns {string} the filename with all tokens expanded.
 */
function interpolateFilename(filename, crawlId) {
  const hostname = os.hostname();

  const values = {
    "@ts": new Date().toISOString().replace(/[:TZz.-]/g, ""),
    "@hostname": hostname,
    "@hostsuffix": hostname.slice(-14),
    "@id": String(crawlId),
  };

  // A function replacer keeps "$&", "$1", "$$" etc. in the values literal.
  return filename.replace(/@(?:ts|hostname|hostsuffix|id)/g, (token) => values[token]);
}
|
|
|
|
|
2021-11-23 12:53:30 -08:00
|
|
|
module.exports.S3StorageSync = S3StorageSync;
|
2022-02-08 15:31:55 -08:00
|
|
|
module.exports.getFileSize = getFileSize;
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
module.exports.getDirSize = getDirSize;
|
2022-05-05 14:27:17 -05:00
|
|
|
module.exports.initStorage = initStorage;
|
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
|
|
|
module.exports.interpolateFilename = interpolateFilename;
|