Fix disk utilization computation errors (#338)

* Check size of /crawls by default to fix disk utilization check

* Refactor calculating percentage used and add unit tests

* Add tests using df output with disk usage above and below
threshold

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
Tessa Walsh 2023-07-06 00:58:28 -04:00 committed by GitHub
parent 3049b957bd
commit 254da95a44
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 128 additions and 26 deletions

View file

@ -13,7 +13,7 @@ import * as warcio from "warcio";
import { HealthChecker } from "./util/healthcheck.js";
import { TextExtract } from "./util/textextract.js";
import { initStorage, getFileSize, getDirSize, interpolateFilename, getDiskUsage } from "./util/storage.js";
import { initStorage, getFileSize, getDirSize, interpolateFilename, checkDiskUtilization } from "./util/storage.js";
import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { parseArgs } from "./util/argParser.js";
@ -610,28 +610,9 @@ export class Crawler {
}
if (this.params.diskUtilization) {
// Check that disk usage isn't already above threshold
const diskUsage = await getDiskUsage();
const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));
if (usedPercentage >= this.params.diskUtilization) {
logger.info(`Disk utilization threshold reached ${usedPercentage}% > ${this.params.diskUtilization}%, stopping`);
interrupt = true;
}
// Check that disk usage isn't likely to cross threshold
const kbUsed = parseInt(diskUsage["Used"]);
const kbTotal = parseInt(diskUsage["1K-blocks"]);
let kbArchiveDirSize = Math.floor(size/1024);
if (this.params.combineWARC && this.params.generateWACZ) {
kbArchiveDirSize *= 4;
} else if (this.params.combineWARC || this.params.generateWACZ) {
kbArchiveDirSize *= 2;
}
const projectedTotal = kbUsed + kbArchiveDirSize;
const projectedUsedPercentage = Math.floor(kbTotal/projectedTotal);
if (projectedUsedPercentage >= this.params.diskUtilization) {
logger.info(`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${this.params.diskUtilization}%, stopping`);
// Check that disk usage isn't already or soon to be above threshold
const diskUtil = await checkDiskUtilization(this.params, size);
if (diskUtil.stop === true) {
interrupt = true;
}
}

3
test-setup.js Normal file
View file

@ -0,0 +1,3 @@
// Make the `jest` object imported from @jest/globals available as a global.
import { jest } from "@jest/globals";
global.jest = jest;

62
tests/storage.test.js Normal file
View file

@ -0,0 +1,62 @@
import { calculatePercentageUsed, checkDiskUtilization } from "../util/storage.js";
// calculatePercentageUsed(used, total) should yield used/total as a rounded whole-number percentage.
test("ensure calculatePercentageUsed returns expected values", () => {
  // Each entry is [used, total, expected percentage].
  const cases = [
    [30, 100, 30],
    [1507, 35750, 4],
    [33819, 35750, 95],
    [140, 70, 200], // usage larger than the total is reported above 100%
    [0, 5, 0],
  ];

  for (const [used, total, expected] of cases) {
    expect(calculatePercentageUsed(used, total)).toEqual(expected);
  }
});
// End-to-end check driven by canned `df` output: current and projected usage both stay under the threshold.
test("verify end-to-end disk utilization not exceeded threshold", async () => {
  const mockDfOutput = `\
Filesystem 1K-blocks Used Available Use% Mounted on
grpcfuse 1000000 285000 715000 28% /crawls`;

  const archiveDirBytes = 5000 * 1024;
  const options = {
    diskUtilization: 90,
    combineWARC: true,
    generateWACZ: true
  };

  // With combineWARC + generateWACZ the archive size is counted 4x:
  // 285k + 4 * 5k = 305k of 1000k = 30.5%, which rounds to 31%.
  // That does not exceed the 90% threshold.
  const outcome = await checkDiskUtilization(options, archiveDirBytes, mockDfOutput);

  expect(outcome).toEqual({
    stop: false,
    used: 28,
    projected: 31,
    threshold: 90
  });
});
// End-to-end check driven by canned `df` output: current usage is under the threshold but the projection crosses it.
test("verify end-to-end disk utilization exceeds threshold", async () => {
  const mockDfOutput = `\
Filesystem 1K-blocks Used Available Use% Mounted on
grpcfuse 100000 85000 15000 85% /crawls`;

  const archiveDirBytes = 3000 * 1024;
  const options = {
    diskUtilization: 90,
    combineWARC: false,
    generateWACZ: true
  };

  // With only generateWACZ the archive size is counted 2x:
  // 85k + 2 * 3k = 91k of 100k = 91%, which exceeds the 90% threshold.
  const outcome = await checkDiskUtilization(options, archiveDirBytes, mockDfOutput);

  expect(outcome).toEqual({
    stop: true,
    used: 85,
    projected: 91,
    threshold: 90
  });
});

View file

@ -150,10 +150,62 @@ export async function getDirSize(dir) {
return size;
}
export async function getDiskUsage(path="/") {
/**
 * Check whether disk usage has reached, or is projected to reach, the
 * configured utilization threshold.
 *
 * Disk statistics are read for "/crawls" through getDiskUsage(); `dfOutput`
 * is forwarded to it unchanged.
 *
 * @param {object} params - crawl parameters; reads `diskUtilization`
 *   (threshold percentage), `combineWARC` and `generateWACZ`.
 * @param {number} archiveDirSize - current archive directory size; it is
 *   divided by 1024 before being added to the "Used" 1K-block count, so it
 *   is presumably in bytes.
 * @param {?string} [dfOutput=null] - optional pre-captured `df` output.
 * @returns {Promise<{stop: boolean, used: number, projected: ?number, threshold: number}>}
 *   `stop` is true when either check meets the threshold. `projected` is
 *   null when current usage alone already meets the threshold.
 */
export async function checkDiskUtilization(params, archiveDirSize, dfOutput=null) {
  const diskUsage = await getDiskUsage("/crawls", dfOutput);
  const threshold = params.diskUtilization;

  // "Use%" carries a trailing "%"; drop it and parse explicitly as base 10.
  const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1), 10);

  // Single place that shapes the return value for all three outcomes.
  const buildResult = (stop, projected) => ({
    stop,
    used: usedPercentage,
    projected,
    threshold
  });

  // Check that disk usage isn't already above threshold
  if (usedPercentage >= threshold) {
    logger.info(`Disk utilization threshold reached ${usedPercentage}% > ${threshold}%, stopping`);
    return buildResult(true, null);
  }

  // Check that disk usage isn't likely to cross threshold
  const kbUsed = parseInt(diskUsage["Used"], 10);
  const kbTotal = parseInt(diskUsage["1K-blocks"], 10);

  // The archive size is counted 4x when both combineWARC and generateWACZ
  // are set, 2x when only one of them is set, and 1x otherwise.
  let sizeMultiplier = 1;
  if (params.combineWARC && params.generateWACZ) {
    sizeMultiplier = 4;
  } else if (params.combineWARC || params.generateWACZ) {
    sizeMultiplier = 2;
  }
  const kbArchiveDirSize = Math.round(archiveDirSize/1024) * sizeMultiplier;

  const projectedTotal = kbUsed + kbArchiveDirSize;
  const projectedUsedPercentage = calculatePercentageUsed(projectedTotal, kbTotal);

  const stop = projectedUsedPercentage >= threshold;
  if (stop) {
    logger.info(`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${threshold}%, stopping`);
  }
  return buildResult(stop, projectedUsedPercentage);
}
/**
 * Run `df` for the given path and return its raw standard output.
 *
 * @param {string} path - filesystem path to report on.
 * @returns {Promise<string>} unparsed stdout produced by `df`.
 */
export async function getDFOutput(path) {
  // execFile hands `path` to df as one argument without spawning a shell, so
  // paths containing spaces or shell metacharacters are taken literally.
  const execFile = util.promisify(child_process.execFile);
  // df is invoked exactly once; only its stdout is needed by callers.
  const { stdout } = await execFile("df", [path]);
  return stdout;
}
export async function getDiskUsage(path="/crawls", dfOutput = null) {
const result = dfOutput || await getDFOutput(path);
const lines = result.split("\n");
const keys = lines[0].split(/\s+/ig);
const rows = lines.slice(1).map(line => {
const values = line.split(/\s+/ig);
@ -165,6 +217,10 @@ export async function getDiskUsage(path="/") {
return rows[0];
}
/**
 * Express `used` as a whole-number percentage of `total`.
 *
 * @param {number} used - amount consumed.
 * @param {number} total - capacity, in the same unit as `used`.
 * @returns {number} the ratio used/total multiplied by 100 and rounded with
 *   Math.round; may exceed 100 when `used` is larger than `total`.
 */
export function calculatePercentageUsed(used, total) {
  const fraction = used / total;
  const percentage = fraction * 100;
  return Math.round(percentage);
}
function checksumFile(hashName, path) {
return new Promise((resolve, reject) => {
const hash = createHash(hashName);