mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)

commit dfb0ee6b32 (parent e5fa61d4cf)

    more convos

3 changed files with 663 additions and 1 deletion

src/util/browser.ts (new file, 410 lines)

@@ -0,0 +1,410 @@
import * as child_process from "child_process";
import fs from "fs";
import { pipeline } from "node:stream/promises";
import { Readable } from "node:stream";

import os from "os";
import path from "path";

import { logger } from "./logger.js";
import { initStorage } from "./storage.js";

import puppeteer from "puppeteer-core";
import { CDPSession, Target, Browser as PBrowser } from "puppeteer-core";


// ==================================================================
export class Browser
{
  profileDir: string;
  customProfile = false;
  emulateDevice = null;

  browser?: PBrowser = null;
  firstCDP: CDPSession = null;

  recorders: any[] = [];

  constructor() {
    this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
  }

  async launch({profileUrl, chromeOptions, signals = false, headless = false, emulateDevice = {}, ondisconnect = null}) {
    if (this.isLaunched()) {
      return;
    }

    if (profileUrl) {
      this.customProfile = await this.loadProfile(profileUrl);
    }

    this.emulateDevice = emulateDevice;

    const args = this.chromeArgs(chromeOptions);

    let defaultViewport = null;

    if (process.env.GEOMETRY) {
      const geom = process.env.GEOMETRY.split("x");

      defaultViewport = {width: Number(geom[0]), height: Number(geom[1])};
    }

    const launchOpts = {
      args,
      headless: headless ? "new" : false,
      executablePath: this.getBrowserExe(),
      ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
      ignoreHTTPSErrors: true,
      handleSIGHUP: signals,
      handleSIGINT: signals,
      handleSIGTERM: signals,
      protocolTimeout: 0,

      defaultViewport,
      waitForInitialPage: false,
      userDataDir: this.profileDir
    };

    await this._init(launchOpts, ondisconnect);
  }
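
A minimal usage sketch of the class above (all values hypothetical; chromeOptions is forwarded to chromeArgs() further down):

// Hypothetical usage; assumes an ESM context with top-level await.
import { Browser } from "./util/browser.js";

const browser = new Browser();

await browser.launch({
  profileUrl: "",                   // falsy, so loadProfile() is skipped
  chromeOptions: { proxy: false },  // forwarded to chromeArgs()
  headless: true,                   // becomes headless: "new"
  signals: false,                   // caller keeps SIGINT/SIGTERM handling
  emulateDevice: {},
  ondisconnect: (err) => console.error("browser disconnected", err),
});
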
  async setupPage({page}) {
    await this.addInitScript(page, "Object.defineProperty(navigator, \"webdriver\", {value: false});");

    if (this.customProfile) {
      logger.info("Disabling Service Workers for profile", {}, "browser");

      await page.setBypassServiceWorker(true);
    }
  }

  async loadProfile(profileFilename) {
    const targetFilename = "/tmp/profile.tar.gz";

    if (profileFilename &&
        (profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {

      logger.info(`Downloading ${profileFilename} to ${targetFilename}`, {}, "browserProfile");

      const resp = await fetch(profileFilename);
      await pipeline(
        Readable.fromWeb(resp.body as any),
        fs.createWriteStream(targetFilename)
      );

      profileFilename = targetFilename;
    } else if (profileFilename && profileFilename.startsWith("@")) {
      const storage = initStorage();

      if (!storage) {
        logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");
      }

      await storage.downloadFile(profileFilename.slice(1), targetFilename);

      profileFilename = targetFilename;
    }

    if (profileFilename) {
      try {
        child_process.execSync("tar xvfz " + profileFilename, {cwd: this.profileDir});
        return true;
      } catch (e) {
        logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
      }
    }

    return false;
  }

  saveProfile(profileFilename) {
    child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: this.profileDir});
  }
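
The profile is just a gzipped tar of the Chromium user-data directory, so state captured once can seed later launches. A sketch of the round trip, continuing the launch sketch above (paths hypothetical); loadProfile() also accepts an http(s) URL or an "@key" path resolved against the configured S3 storage:

// Hypothetical: persist the current user-data dir, then seed a fresh browser with it.
browser.saveProfile("/tmp/profile.tar.gz");

const seeded = new Browser();
await seeded.launch({
  profileUrl: "/tmp/profile.tar.gz",  // local path: untarred into the new profileDir
  chromeOptions: { proxy: false },
  headless: true,
});
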
  chromeArgs({proxy=true, userAgent=null, extraArgs=[]} = {}) {
    // Chrome Flags, including proxy server
    const args = [
      // eslint-disable-next-line no-use-before-define
      ...defaultArgs,
      ...(process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean),
      //"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
      "--no-sandbox",
      "--disable-background-media-suspend",
      "--remote-debugging-port=9221",
      "--remote-allow-origins=*",
      "--autoplay-policy=no-user-gesture-required",
      "--disable-site-isolation-trials",
      `--user-agent=${userAgent || this.getDefaultUA()}`,
      ...extraArgs,
    ];

    if (proxy) {
      args.push("--ignore-certificate-errors");
      args.push(`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`);
    }

    return args;
  }
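
When proxy is left enabled, the endpoint comes from the PROXY_HOST and PROXY_PORT environment variables; a small sketch (values hypothetical):

// Hypothetical values; normally set in the container environment.
process.env.PROXY_HOST = "localhost";
process.env.PROXY_PORT = "8080";

const args = new Browser().chromeArgs({ proxy: true, extraArgs: ["--lang=en-US"] });
// args ends with:
//   "--lang=en-US",
//   "--ignore-certificate-errors",
//   "--proxy-server=http://localhost:8080"
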
  getDefaultUA() {
    let version = process.env.BROWSER_VERSION;

    try {
      version = child_process.execFileSync(this.getBrowserExe(), ["--version"], {encoding: "utf8"});
      version = version.match(/[\d.]+/)[0];
    } catch(e) {
      console.error(e);
    }

    return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
  }

  getBrowserExe() {
    const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
    for (const file of files) {
      if (file && fs.existsSync(file)) {
        return file;
      }
    }

    return null;
  }

  async evaluateWithCLI_(cdp, frame, cdpContextId, funcString, logData, contextName) {
    const frameUrl = frame.url();
    let details = {frameUrl, ...logData};

    if (!frameUrl || frame.isDetached()) {
      logger.info("Run Script Skipped, frame no longer attached or has no URL", details, contextName);
      return false;
    }

    logger.info("Run Script Started", details, contextName);

    // from puppeteer _evaluateInternal() but with includeCommandLineAPI: true
    //const contextId = context._contextId;
    const expression = funcString + "\n//# sourceURL=__evaluation_script__";

    const { exceptionDetails, result } = await cdp
      .send("Runtime.evaluate", {
        expression,
        contextId: cdpContextId,
        returnByValue: true,
        awaitPromise: true,
        userGesture: true,
        includeCommandLineAPI: true,
      });

    if (exceptionDetails) {
      if (exceptionDetails.stackTrace) {
        details = {...exceptionDetails.stackTrace, text: exceptionDetails.text, ...details};
      }
      logger.error("Run Script Failed", details, contextName);
    } else {
      logger.info("Run Script Finished", details, contextName);
    }

    return result.value;
  }
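
The reason for using raw Runtime.evaluate instead of puppeteer's frame.evaluate() is includeCommandLineAPI: true, which exposes DevTools console helpers such as $ and $$ to the evaluated script. A minimal sketch, assuming an existing CDPSession and context id:

import { CDPSession } from "puppeteer-core";

// Hypothetical helper: count anchors via the Command Line API's $$,
// which is only defined because includeCommandLineAPI is set.
async function countLinks(cdp: CDPSession, contextId: number) {
  const { result } = await cdp.send("Runtime.evaluate", {
    expression: "$$('a').length",
    contextId,
    returnByValue: true,
    awaitPromise: true,
    includeCommandLineAPI: true,
  });
  return result.value as number;
}
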
  isLaunched() {
    if (this.browser) {
      logger.warn("Context already inited", {}, "browser");
      return true;
    }

    return false;
  }

  async close() {
    if (this.browser) {
      this.browser.removeAllListeners("disconnected");
      await this.browser.close();
      this.browser = null;
    }
  }

  addInitScript(page, script) {
    return page.evaluateOnNewDocument(script);
  }

  async _init(launchOpts, ondisconnect = null) {
    this.browser = await puppeteer.launch(launchOpts);

    const target = this.browser.target();

    this.firstCDP = await target.createCDPSession();

    await this.serviceWorkerFetch();

    if (ondisconnect) {
      this.browser.on("disconnected", (err) => ondisconnect(err));
    }
    this.browser.on("disconnected", () => {
      this.browser = null;
    });
  }

  async newWindowPageWithCDP() {
    // unique url to detect new pages
    const startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);

    const p = new Promise<Target>((resolve) => {
      const listener = (target) => {
        if (target.url() === startPage) {
          resolve(target);
          this.browser.removeListener("targetcreated", listener);
        }
      };

      this.browser.on("targetcreated", listener);
    });

    try {
      await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
    } catch (e) {
      if (!this.browser) {
        throw e;
      }
      const target = this.browser.target();

      this.firstCDP = await target.createCDPSession();

      await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
    }

    const target = await p;

    const page = await target.page();

    const device = this.emulateDevice;

    if (device) {
      if (device.viewport && device.userAgent) {
        await page.emulate(device);
      } else if (device.userAgent) {
        await page.setUserAgent(device.userAgent);
      }
    }

    const cdp = await target.createCDPSession();

    return {page, cdp};
  }
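
Continuing the launch sketch above, each worker gets its own window plus a CDP session scoped to that window's target:

const { page, cdp } = await browser.newWindowPageWithCDP();

await page.goto("https://example.com/");  // an ordinary puppeteer Page
await cdp.send("Network.enable");         // per-target CDP session
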
  async serviceWorkerFetch() {
    this.firstCDP.on("Fetch.requestPaused", async (params) => {
      const { frameId, requestId, networkId, request } = params;

      if (networkId) {
        try {
          await this.firstCDP.send("Fetch.continueResponse", {requestId});
        } catch (e) {
          logger.warn("continueResponse failed", {url: request.url}, "recorder");
        }
        return;
      }

      let foundRecorder = null;

      for (const recorder of this.recorders) {
        if (recorder.swUrls.has(request.url)) {
          //console.log(`*** found sw ${request.url} in recorder for worker ${recorder.workerid}`);
          recorder.swFrameIds.add(frameId);
        }

        if (recorder.swFrameIds && recorder.swFrameIds.has(frameId)) {
          foundRecorder = recorder;
          break;
        }
      }

      if (!foundRecorder) {
        logger.warn("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder");

        try {
          await this.firstCDP.send("Fetch.continueResponse", {requestId});
        } catch (e) {
          logger.warn("continueResponse failed", {url: request.url}, "recorder");
        }

        return;
      }

      await foundRecorder.handleRequestPaused(params, this.firstCDP, true);
    });

    await this.firstCDP.send("Fetch.enable", {patterns: [{urlPattern: "*", requestStage: "Response"}]});
  }

  async evaluateWithCLI(_, frame, cdp, funcString, logData, contextName) {
    const context = await frame.executionContext();
    cdp = context._client;
    const cdpContextId = context._contextId;
    return await this.evaluateWithCLI_(cdp, frame, cdpContextId, funcString, logData, contextName);
  }

  interceptRequest(page, callback) {
    page.on("request", callback);
  }

  async waitForNetworkIdle(page, params) {
    return await page.waitForNetworkIdle(params);
  }

  async setViewport(page, params) {
    await page.setViewport(params);
  }

  async getCookies(page) {
    return await page.cookies();
  }

  async setCookies(page, cookies) {
    return await page.setCookie(...cookies);
  }
}


// ==================================================================
// Default Chromium args from playwright
export const defaultArgs = [
  "--disable-field-trial-config", // https://source.chromium.org/chromium/chromium/src/+/main:testing/variations/README.md
  "--disable-background-networking",
  "--enable-features=NetworkService,NetworkServiceInProcess",
  "--disable-background-timer-throttling",
  "--disable-backgrounding-occluded-windows",
  "--disable-back-forward-cache", // Avoids surprises like main request not being intercepted during page.goBack().
  "--disable-breakpad",
  "--disable-client-side-phishing-detection",
  "--disable-component-extensions-with-background-pages",
  "--disable-component-update", // Avoids unneeded network activity after startup.
  "--no-default-browser-check",
  "--disable-default-apps",
  "--disable-dev-shm-usage",
  "--disable-extensions",
  // AvoidUnnecessaryBeforeUnloadCheckSync - https://github.com/microsoft/playwright/issues/14047
  // Translate - https://github.com/microsoft/playwright/issues/16126
  // Optimization* - https://bugs.chromium.org/p/chromium/issues/detail?id=1311753
  "--disable-features=ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate,OptimizationGuideModelDownloading,OptimizationHintsFetching,OptimizationTargetPrediction,OptimizationHints",
  "--allow-pre-commit-input",
  "--disable-hang-monitor",
  "--disable-ipc-flooding-protection",
  "--disable-popup-blocking",
  "--disable-prompt-on-repost",
  "--disable-renderer-backgrounding",
  "--disable-sync",
  "--force-color-profile=srgb",
  "--metrics-recording-only",
  "--no-first-run",
  "--enable-automation",
  "--password-store=basic",
  "--use-mock-keychain",
  // See https://chromium-review.googlesource.com/c/chromium/src/+/2436773
  "--no-service-autorun",
  "--export-tagged-pdf",
  "--apps-keep-chrome-alive-in-tests",
  "--apps-gallery-url=https://invalid.webstore.example.com/",
  "--apps-gallery-update-url=https://invalid.webstore.example.com/"
];

src/util/storage.ts (new file, 252 lines)

@@ -0,0 +1,252 @@
import child_process from "child_process";
import fs from "fs";
import fsp from "fs/promises";
import util from "util";

import os from "os";
import { createHash } from "crypto";

import Minio from "minio";

import { initRedis } from "./redis.js";
import { logger } from "./logger.js";

import getFolderSize from "get-folder-size";


// ===========================================================================
export class S3StorageSync
{
  fullPrefix: string;
  client: Minio.Client;

  bucketName: string;
  objectPrefix: string;
  resources: object[] = [];

  userId: string;
  crawlId: string;
  webhookUrl: string;

  constructor(urlOrData, {webhookUrl, userId, crawlId} : {webhookUrl: string, userId: string, crawlId: string}) {
    let url;
    let accessKey;
    let secretKey;

    if (typeof(urlOrData) === "string") {
      url = new URL(urlOrData);
      accessKey = url.username;
      secretKey = url.password;
      url.username = "";
      url.password = "";
      this.fullPrefix = url.href;

    } else {
      url = new URL(urlOrData.endpointUrl);
      accessKey = urlOrData.accessKey;
      secretKey = urlOrData.secretKey;
      this.fullPrefix = url.href;
    }

    this.client = new Minio.Client({
      endPoint: url.hostname,
      port: Number(url.port) || (url.protocol === "https:" ? 443 : 80),
      useSSL: url.protocol === "https:",
      accessKey,
      secretKey,
      partSize: 100*1024*1024
    });

    this.client.enableSHA256 = true;

    this.bucketName = url.pathname.slice(1).split("/")[0];

    this.objectPrefix = url.pathname.slice(this.bucketName.length + 2);

    this.resources = [];

    this.userId = userId;
    this.crawlId = crawlId;
    this.webhookUrl = webhookUrl;
  }
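
In the string form, credentials ride in the URL userinfo and the first path segment names the bucket; a sketch with hypothetical values:

// Hypothetical endpoint, bucket, and credentials.
const storage = new S3StorageSync(
  "https://ACCESS_KEY:SECRET_KEY@s3.example.com/my-bucket/crawls/",
  { webhookUrl: "", userId: "user1", crawlId: "crawl-123" }
);
// storage.bucketName   === "my-bucket"
// storage.objectPrefix === "crawls/"
// storage.fullPrefix   === "https://s3.example.com/my-bucket/crawls/" (userinfo stripped)
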
  async uploadFile(srcFilename, targetFilename) {
    const fileUploadInfo = {
      "bucket": this.bucketName,
      "crawlId": this.crawlId,
      "prefix": this.objectPrefix,
      targetFilename
    };
    logger.info("S3 file upload information", fileUploadInfo, "s3Upload");

    await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename);

    const finalHash = await checksumFile("sha256", srcFilename);

    const size = await getFileSize(srcFilename);
    return {"path": targetFilename, "hash": finalHash, "bytes": size};
  }

  async downloadFile(srcFilename, destFilename) {
    await this.client.fGetObject(this.bucketName, this.objectPrefix + srcFilename, destFilename);
  }

  async uploadCollWACZ(srcFilename, targetFilename, completed = true) {
    const resource = await this.uploadFile(srcFilename, targetFilename);
    logger.info("WACZ S3 file upload resource", resource, "s3Upload");

    if (this.webhookUrl) {
      const body = {
        id: this.crawlId,
        user: this.userId,

        //filename: `s3://${this.bucketName}/${this.objectPrefix}${this.waczFilename}`,
        filename: this.fullPrefix + targetFilename,

        hash: resource.hash,
        size: resource.bytes,

        completed
      };

      logger.info(`Pinging Webhook: ${this.webhookUrl}`);

      if (this.webhookUrl.startsWith("http://") || this.webhookUrl.startsWith("https://")) {
        await fetch(this.webhookUrl, {method: "POST", body: JSON.stringify(body)});
      } else if (this.webhookUrl.startsWith("redis://")) {
        const parts = this.webhookUrl.split("/");
        if (parts.length !== 5) {
          logger.fatal("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>");
        }
        const redis = await initRedis(parts.slice(0, 4).join("/"));
        await redis.rpush(parts[4], JSON.stringify(body));
      }
    }
  }
}
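
For an http(s) webhook the payload is POSTed as JSON; for redis://host:port/db/key the same JSON is RPUSHed onto the key. The body looks like this (values hypothetical):

// Shape of the webhook payload, with made-up values.
const exampleBody = {
  id: "crawl-123",
  user: "user1",
  filename: "https://s3.example.com/my-bucket/crawls/collection.wacz",
  hash: "9f86d081884c7d65...",  // sha256 of the uploaded WACZ
  size: 1234567,                // bytes
  completed: true,
};
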
export function initStorage() {
  if (!process.env.STORE_ENDPOINT_URL) {
    return null;
  }

  const endpointUrl = process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
  const storeInfo = {
    endpointUrl,
    accessKey: process.env.STORE_ACCESS_KEY,
    secretKey: process.env.STORE_SECRET_KEY,
  };

  const opts = {
    crawlId: process.env.CRAWL_ID || os.hostname(),
    webhookUrl: process.env.WEBHOOK_URL,
    userId: process.env.STORE_USER,
  };

  logger.info("Initing Storage...");
  return new S3StorageSync(storeInfo, opts);
}
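
A sketch of the environment initStorage() reads (all values hypothetical):

// Hypothetical configuration; in practice these come from the container env.
process.env.STORE_ENDPOINT_URL = "https://s3.example.com/my-bucket/";
process.env.STORE_PATH = "crawls/";
process.env.STORE_ACCESS_KEY = "ACCESS_KEY";
process.env.STORE_SECRET_KEY = "SECRET_KEY";
process.env.STORE_USER = "user1";
process.env.WEBHOOK_URL = "https://example.com/hook";

const storage = initStorage();  // null whenever STORE_ENDPOINT_URL is unset
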
export async function getFileSize(filename) {
  const stats = await fsp.stat(filename);
  return stats.size;
}

export async function getDirSize(dir) {
  const { size, errors } = await getFolderSize(dir);
  if (errors && errors.length) {
    logger.warn("Size check errors", {errors}, "sizecheck");
  }
  return size;
}

export async function checkDiskUtilization(params, archiveDirSize, dfOutput=null) {
  const diskUsage = await getDiskUsage("/crawls", dfOutput);
  const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));

  // Check that disk usage isn't already above threshold
  if (usedPercentage >= params.diskUtilization) {
    logger.info(`Disk utilization threshold reached ${usedPercentage}% > ${params.diskUtilization}%, stopping`);
    return {
      stop: true,
      used: usedPercentage,
      projected: null,
      threshold: params.diskUtilization
    };
  }

  // Check that disk usage isn't likely to cross threshold
  const kbUsed = parseInt(diskUsage["Used"]);
  const kbTotal = parseInt(diskUsage["1K-blocks"]);

  let kbArchiveDirSize = Math.round(archiveDirSize/1024);
  if (params.combineWARC && params.generateWACZ) {
    kbArchiveDirSize *= 4;
  } else if (params.combineWARC || params.generateWACZ) {
    kbArchiveDirSize *= 2;
  }

  const projectedTotal = kbUsed + kbArchiveDirSize;
  const projectedUsedPercentage = calculatePercentageUsed(projectedTotal, kbTotal);

  if (projectedUsedPercentage >= params.diskUtilization) {
    logger.info(`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${params.diskUtilization}%, stopping`);
    return {
      stop: true,
      used: usedPercentage,
      projected: projectedUsedPercentage,
      threshold: params.diskUtilization
    };
  }

  return {
    stop: false,
    used: usedPercentage,
    projected: projectedUsedPercentage,
    threshold: params.diskUtilization
  };
}
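
A worked example of the projection: with 50 GB used of a 100 GB volume (50%), a 10 GB archive directory, and only generateWACZ enabled, the archive size is doubled, so the projection is 50 GB + 20 GB = 70 GB, i.e. 70%, still under a 90% threshold:

// Synthetic df output: 100 GB volume, 50% used (sizes in 1K blocks).
const dfOutput = [
  "Filesystem     1K-blocks      Used Available Use% Mounted on",
  "overlay        104857600  52428800  52428800  50% /crawls",
].join("\n");

const params = { diskUtilization: 90, combineWARC: false, generateWACZ: true };
const archiveDirSize = 10 * 1024 * 1024 * 1024;  // 10 GB, in bytes

const check = await checkDiskUtilization(params, archiveDirSize, dfOutput);
// => { stop: false, used: 50, projected: 70, threshold: 90 }
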
export async function getDFOutput(path) {
  const exec = util.promisify(child_process.exec);
  const res = await exec(`df ${path}`);
  return res.stdout;
}

export async function getDiskUsage(path="/crawls", dfOutput = null) {
  const result = dfOutput || await getDFOutput(path);
  const lines = result.split("\n");
  const keys = lines[0].split(/\s+/ig);
  const rows = lines.slice(1).map(line => {
    const values = line.split(/\s+/ig);
    return keys.reduce((o, k, index) => {
      o[k] = values[index];
      return o;
    }, {});
  });
  return rows[0];
}
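
The parser zips the df header row with the first data row; note that the two-word header "Mounted on" produces two keys, so the mount point lands under "Mounted":

// Hypothetical df output and the row object it parses into.
const usage = await getDiskUsage("/crawls",
  "Filesystem     1K-blocks      Used Available Use% Mounted on\n" +
  "overlay        104857600  52428800  52428800  50% /crawls\n");
// => { Filesystem: "overlay", "1K-blocks": "104857600", Used: "52428800",
//      Available: "52428800", "Use%": "50%", Mounted: "/crawls", on: undefined }
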
export function calculatePercentageUsed(used, total) {
  return Math.round((used/total) * 100);
}

function checksumFile(hashName, path) {
  return new Promise((resolve, reject) => {
    const hash = createHash(hashName);
    const stream = fs.createReadStream(path);
    stream.on("error", err => reject(err));
    stream.on("data", chunk => hash.update(chunk));
    stream.on("end", () => resolve(hash.digest("hex")));
  });
}

export function interpolateFilename(filename, crawlId) {
  filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, ""));
  filename = filename.replace("@hostname", os.hostname());
  filename = filename.replace("@hostsuffix", os.hostname().slice(-14));
  filename = filename.replace("@id", crawlId);
  return filename;
}
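
A quick example of the template tokens (hostname and timestamp hypothetical):

// E.g. on host "crawler-0" at 2023-11-05T12:34:56.789Z:
interpolateFilename("@hostname-@id-@ts.wacz", "crawl-123");
// => "crawler-0-crawl-123-20231105123456789.wacz"
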
@@ -89,7 +89,7 @@ export class BaseBrowser

       profileFilename = targetFilename;
     } else if (profileFilename && profileFilename.startsWith("@")) {
-      const storage = initStorage("");
+      const storage = initStorage();

       if (!storage) {
         logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");