mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Fix disk utilization computation errors (#338)
* Check size of /crawls by default to fix disk utilization check * Refactor calculating percentage used and add unit tests * add tests using df output for with disk usage above and below threshold --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
3049b957bd
commit
254da95a44
4 changed files with 128 additions and 26 deletions
27
crawler.js
27
crawler.js
|
@ -13,7 +13,7 @@ import * as warcio from "warcio";
|
|||
|
||||
import { HealthChecker } from "./util/healthcheck.js";
|
||||
import { TextExtract } from "./util/textextract.js";
|
||||
import { initStorage, getFileSize, getDirSize, interpolateFilename, getDiskUsage } from "./util/storage.js";
|
||||
import { initStorage, getFileSize, getDirSize, interpolateFilename, checkDiskUtilization } from "./util/storage.js";
|
||||
import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screencaster.js";
|
||||
import { Screenshots } from "./util/screenshots.js";
|
||||
import { parseArgs } from "./util/argParser.js";
|
||||
|
@ -610,28 +610,9 @@ export class Crawler {
|
|||
}
|
||||
|
||||
if (this.params.diskUtilization) {
|
||||
// Check that disk usage isn't already above threshold
|
||||
const diskUsage = await getDiskUsage();
|
||||
const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));
|
||||
if (usedPercentage >= this.params.diskUtilization) {
|
||||
logger.info(`Disk utilization threshold reached ${usedPercentage}% > ${this.params.diskUtilization}%, stopping`);
|
||||
interrupt = true;
|
||||
}
|
||||
|
||||
// Check that disk usage isn't likely to cross threshold
|
||||
const kbUsed = parseInt(diskUsage["Used"]);
|
||||
const kbTotal = parseInt(diskUsage["1K-blocks"]);
|
||||
let kbArchiveDirSize = Math.floor(size/1024);
|
||||
if (this.params.combineWARC && this.params.generateWACZ) {
|
||||
kbArchiveDirSize *= 4;
|
||||
} else if (this.params.combineWARC || this.params.generateWACZ) {
|
||||
kbArchiveDirSize *= 2;
|
||||
}
|
||||
|
||||
const projectedTotal = kbUsed + kbArchiveDirSize;
|
||||
const projectedUsedPercentage = Math.floor(kbTotal/projectedTotal);
|
||||
if (projectedUsedPercentage >= this.params.diskUtilization) {
|
||||
logger.info(`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${this.params.diskUtilization}%, stopping`);
|
||||
// Check that disk usage isn't already or soon to be above threshold
|
||||
const diskUtil = await checkDiskUtilization(this.params, size);
|
||||
if (diskUtil.stop === true) {
|
||||
interrupt = true;
|
||||
}
|
||||
}
|
||||
|
|
3
test-setup.js
Normal file
3
test-setup.js
Normal file
|
@ -0,0 +1,3 @@
|
|||
import { jest } from "@jest/globals";
|
||||
|
||||
// Expose the `jest` object imported from "@jest/globals" on the global scope.
// NOTE(review): presumably needed because `jest` is not injected as a global
// in this test configuration — confirm against the jest config.
global.jest = jest;
|
62
tests/storage.test.js
Normal file
62
tests/storage.test.js
Normal file
|
@ -0,0 +1,62 @@
|
|||
import { calculatePercentageUsed, checkDiskUtilization } from "../util/storage.js";
|
||||
|
||||
|
||||
// calculatePercentageUsed(used, total) is expected to return used/total as a
// percentage rounded to the nearest whole number, including values above 100
// when `used` exceeds `total`.
test("ensure calculatePercentageUsed returns expected values", () => {
  expect(calculatePercentageUsed(30, 100)).toEqual(30);

  // 1507 / 35750 = 4.2% -> rounds down
  expect(calculatePercentageUsed(1507, 35750)).toEqual(4);

  // 33819 / 35750 = 94.6% -> rounds up
  expect(calculatePercentageUsed(33819, 35750)).toEqual(95);

  // usage larger than capacity is reported as more than 100%
  expect(calculatePercentageUsed(140, 70)).toEqual(200);

  expect(calculatePercentageUsed(0, 5)).toEqual(0);
});
|
||||
|
||||
|
||||
// Feeds checkDiskUtilization a canned `df` output (third argument) so that no
// real `df` call is needed, and checks the full result object for a disk that
// stays below the threshold.
test("verify end-to-end disk utilization not exceeded threshold", async () => {

  const params = {
    diskUtilization: 90,
    combineWARC: true,
    generateWACZ: true
  };

  // The df lines must start at column 0: they are parsed by splitting on
  // whitespace, so stray leading characters would shift the columns.
  const mockDfOutput = `\
Filesystem 1K-blocks Used Available Use% Mounted on
grpcfuse 1000000 285000 715000 28% /crawls`;

  // with combineWARC + generateWACZ, projected is 285k + 4 * 5k = 305k = 30.5%,
  // which rounds to 31%
  // does not exceed 90% threshold
  const returnValue = await checkDiskUtilization(params, 5000 * 1024, mockDfOutput);
  expect(returnValue).toEqual({
    stop: false,
    used: 28,
    projected: 31,
    threshold: 90
  });
});
|
||||
|
||||
|
||||
// Feeds checkDiskUtilization a canned `df` output (third argument) for a disk
// that is currently below the threshold but projected to cross it.
test("verify end-to-end disk utilization exceeds threshold", async () => {

  const params = {
    diskUtilization: 90,
    combineWARC: false,
    generateWACZ: true
  };

  // The df lines must start at column 0: they are parsed by splitting on
  // whitespace, so stray leading characters would shift the columns.
  const mockDfOutput = `\
Filesystem 1K-blocks Used Available Use% Mounted on
grpcfuse 100000 85000 15000 85% /crawls`;

  // with generateWACZ, projected is 85k + 3k x 2 = 91k = 91%
  // exceeds 90% threshold
  const returnValue = await checkDiskUtilization(params, 3000 * 1024, mockDfOutput);
  expect(returnValue).toEqual({
    stop: true,
    used: 85,
    projected: 91,
    threshold: 90
  });
});
|
|
@ -150,10 +150,62 @@ export async function getDirSize(dir) {
|
|||
return size;
|
||||
}
|
||||
|
||||
export async function getDiskUsage(path="/") {
|
||||
/**
 * Decide whether crawling should stop because the disk backing `/crawls` is,
 * or is projected to become, at least as full as the configured threshold.
 *
 * Two checks are made, in order:
 *  1. the current `Use%` reported by df against the threshold;
 *  2. a projection: df `Used` plus the archive directory size (scaled, see
 *     below) as a percentage of df `1K-blocks`.
 *
 * @param {object} params - crawl parameters; reads `diskUtilization`
 *   (threshold, in percent), `combineWARC` and `generateWACZ`.
 * @param {number} archiveDirSize - current archive directory size in bytes.
 * @param {?string} dfOutput - optional raw `df` output to parse instead of
 *   running `df` (used by tests).
 * @returns {Promise<{stop: boolean, used: number, projected: ?number, threshold: number}>}
 *   `projected` is `null` when the current usage already meets the threshold,
 *   because the projection is not computed in that case.
 */
export async function checkDiskUtilization(params, archiveDirSize, dfOutput=null) {
  const threshold = params.diskUtilization;
  const diskUsage = await getDiskUsage("/crawls", dfOutput);

  // df prints Use% as e.g. "28%"; strip the trailing percent sign.
  const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1), 10);

  // Check that disk usage isn't already above threshold
  if (usedPercentage >= threshold) {
    logger.info(`Disk utilization threshold reached ${usedPercentage}% >= ${threshold}%, stopping`);
    return {
      stop: true,
      used: usedPercentage,
      projected: null,
      threshold
    };
  }

  // Check that disk usage isn't likely to cross threshold
  const kbUsed = parseInt(diskUsage["Used"], 10);
  const kbTotal = parseInt(diskUsage["1K-blocks"], 10);

  // The archive size is counted 4x when both combineWARC and generateWACZ are
  // set and 2x when exactly one is set — presumably to leave room for the
  // extra copies those steps write; confirm against the crawler's output steps.
  let sizeMultiplier = 1;
  if (params.combineWARC && params.generateWACZ) {
    sizeMultiplier = 4;
  } else if (params.combineWARC || params.generateWACZ) {
    sizeMultiplier = 2;
  }
  const kbArchiveDirSize = Math.round(archiveDirSize/1024) * sizeMultiplier;

  const projectedTotal = kbUsed + kbArchiveDirSize;
  const projectedUsedPercentage = calculatePercentageUsed(projectedTotal, kbTotal);

  const stop = projectedUsedPercentage >= threshold;
  if (stop) {
    logger.info(`Disk utilization projected to reach threshold ${projectedUsedPercentage}% >= ${threshold}%, stopping`);
  }

  return {
    stop,
    used: usedPercentage,
    projected: projectedUsedPercentage,
    threshold
  };
}
|
||||
|
||||
/**
 * Run `df` for the given path and return its raw standard output.
 *
 * @param {string} path - filesystem path to report on.
 * @returns {Promise<string>} the text `df` wrote to stdout.
 *   The returned promise rejects if `df` cannot be run or exits non-zero.
 */
export async function getDFOutput(path) {
  // execFile hands `path` to df as a single argument without going through a
  // shell, so spaces or shell metacharacters in the path are not interpreted.
  const execFile = util.promisify(child_process.execFile);
  const res = await execFile("df", [path]);
  return res.stdout;
}
|
||||
|
||||
export async function getDiskUsage(path="/crawls", dfOutput = null) {
|
||||
const result = dfOutput || await getDFOutput(path);
|
||||
const lines = result.split("\n");
|
||||
const keys = lines[0].split(/\s+/ig);
|
||||
const rows = lines.slice(1).map(line => {
|
||||
const values = line.split(/\s+/ig);
|
||||
|
@ -165,6 +217,10 @@ export async function getDiskUsage(path="/") {
|
|||
return rows[0];
|
||||
}
|
||||
|
||||
/**
 * Express `used` as a percentage of `total`, rounded to the nearest integer.
 *
 * The result is not clamped: it exceeds 100 when `used` is larger than
 * `total`. A `total` of 0 yields `NaN` (when `used` is 0) or `Infinity`,
 * since no guard is applied to the division.
 *
 * @param {number} used - amount in use.
 * @param {number} total - total capacity, in the same unit as `used`.
 * @returns {number} whole-number percentage.
 */
export function calculatePercentageUsed(used, total) {
  return Math.round((used/total) * 100);
}
|
||||
|
||||
function checksumFile(hashName, path) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const hash = createHash(hashName);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue