Add more exit codes to detect interruption reason (#764)

Fix #584

- Replace interrupted with interruptReason
- Distinct exit codes for different interrupt reasons: SizeLimit (14), TimeLimit (15), FailedLimit (12), DiskUtilization (16)
are used when an interrupt happens for these reasons, in addition to existing reasons BrowserCrashed (10),
SignalInterrupted (11) and SignalInterruptedForce (13)
- Doc fix to cli args

---------
Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
benoit74 2025-02-10 23:00:55 +01:00 committed by GitHub
parent 846f0355f6
commit fc56c2cf76
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 139 additions and 52 deletions

View file

@ -243,7 +243,7 @@ Options:
--maxPageRetries, --retries If set, number of times to retry a p
age that failed to load before page
is considered to have failed
[number] [default: 1]
[number] [default: 2]
--failOnFailedSeed If set, crawler will fail with exit
code 1 if any seed fails. When combi
ned with --failOnInvalidStatus,will

View file

@ -48,6 +48,7 @@ import {
PAGE_OP_TIMEOUT_SECS,
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
ExitCodes,
InterruptReason,
} from "./util/constants.js";
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
@ -168,8 +169,7 @@ export class Crawler {
skipTextDocs = 0;
interrupted = false;
browserCrashed = false;
interruptReason: InterruptReason | null = null;
finalExit = false;
uploadAndDeleteLocal = false;
done = false;
@ -307,7 +307,7 @@ export class Crawler {
this.healthChecker = null;
this.interrupted = false;
this.interruptReason = null;
this.finalExit = false;
this.uploadAndDeleteLocal = false;
@ -596,11 +596,28 @@ export class Crawler {
} else if (stopped) {
status = "done";
logger.info("Crawl gracefully stopped on request");
} else if (this.interrupted) {
} else if (this.interruptReason) {
status = "interrupted";
exitCode = this.browserCrashed
? ExitCodes.BrowserCrashed
: ExitCodes.InterruptedGraceful;
switch (this.interruptReason) {
case InterruptReason.SizeLimit:
exitCode = ExitCodes.SizeLimit;
break;
case InterruptReason.BrowserCrashed:
exitCode = ExitCodes.BrowserCrashed;
break;
case InterruptReason.SignalInterrupted:
exitCode = ExitCodes.SignalInterrupted;
break;
case InterruptReason.DiskUtilization:
exitCode = ExitCodes.DiskUtilization;
break;
case InterruptReason.FailedLimit:
exitCode = ExitCodes.FailedLimit;
break;
case InterruptReason.TimeLimit:
exitCode = ExitCodes.TimeLimit;
break;
}
}
}
} catch (e) {
@ -1378,7 +1395,7 @@ self.__bx_behaviors.selectMainBehavior();
}
async checkLimits() {
let interrupt = false;
let interrupt: InterruptReason | null = null;
const size = await this.updateCurrSize();
@ -1387,7 +1404,7 @@ self.__bx_behaviors.selectMainBehavior();
logger.info(
`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`,
);
interrupt = true;
interrupt = InterruptReason.SizeLimit;
}
}
@ -1397,7 +1414,7 @@ self.__bx_behaviors.selectMainBehavior();
logger.info(
`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`,
);
interrupt = true;
interrupt = InterruptReason.TimeLimit;
}
}
@ -1409,7 +1426,7 @@ self.__bx_behaviors.selectMainBehavior();
size,
);
if (diskUtil.stop === true) {
interrupt = true;
interrupt = InterruptReason.DiskUtilization;
}
}
@ -1419,18 +1436,21 @@ self.__bx_behaviors.selectMainBehavior();
if (numFailed >= failedLimit) {
logger.fatal(
`Failed threshold reached ${numFailed} >= ${failedLimit}, failing crawl`,
{},
"general",
ExitCodes.FailedLimit,
);
}
}
if (interrupt) {
this.uploadAndDeleteLocal = true;
this.gracefulFinishOnInterrupt();
this.gracefulFinishOnInterrupt(interrupt);
}
}
gracefulFinishOnInterrupt() {
this.interrupted = true;
gracefulFinishOnInterrupt(interruptReason: InterruptReason) {
this.interruptReason = interruptReason;
logger.info("Crawler interrupted, gracefully finishing current pages");
if (!this.params.waitOnDone && !this.params.restartsOnError) {
this.finalExit = true;
@ -1457,23 +1477,25 @@ self.__bx_behaviors.selectMainBehavior();
async serializeAndExit() {
await this.serializeConfig();
if (this.interrupted) {
await this.browser.close();
if (this.interruptReason) {
await closeWorkers(0);
await this.browser.close();
await this.closeFiles();
if (!this.done) {
await this.setStatusAndExit(
ExitCodes.InterruptedImmediate,
ExitCodes.SignalInterruptedForce,
"interrupted",
);
return;
}
}
await this.setStatusAndExit(ExitCodes.Success, "done");
}
async isCrawlRunning() {
if (this.interrupted) {
if (this.interruptReason) {
return false;
}
@ -1495,6 +1517,7 @@ self.__bx_behaviors.selectMainBehavior();
this.healthChecker = new HealthChecker(
this.params.healthCheckPort,
this.params.workers,
this.browser,
async () => {
await this.updateCurrSize();
},
@ -1726,7 +1749,7 @@ self.__bx_behaviors.selectMainBehavior();
if (
this.params.generateWACZ &&
!this.params.dryRun &&
(!this.interrupted || this.finalExit || this.uploadAndDeleteLocal)
(!this.interruptReason || this.finalExit || this.uploadAndDeleteLocal)
) {
const uploaded = await this.generateWACZ();
@ -1742,7 +1765,7 @@ self.__bx_behaviors.selectMainBehavior();
}
}
if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) {
if (this.params.waitOnDone && (!this.interruptReason || this.finalExit)) {
this.done = true;
logger.info("All done, waiting for signal...");
await this.crawlState.setStatus("done");
@ -1753,11 +1776,8 @@ self.__bx_behaviors.selectMainBehavior();
}
markBrowserCrashed() {
this.interrupted = true;
this.browserCrashed = true;
if (this.healthChecker) {
this.healthChecker.browserCrashed = true;
}
this.interruptReason = InterruptReason.BrowserCrashed;
this.browser.crashed = true;
}
async closeLog(): Promise<void> {

View file

@ -5,7 +5,7 @@ import { setExitOnRedisError } from "./util/redis.js";
import { Crawler } from "./crawler.js";
import { ReplayCrawler } from "./replaycrawler.js";
import fs from "node:fs";
import { ExitCodes } from "./util/constants.js";
import { ExitCodes, InterruptReason } from "./util/constants.js";
let crawler: Crawler | null = null;
@ -29,9 +29,9 @@ async function handleTerminate(signame: string) {
try {
await crawler.checkCanceled();
if (!crawler.interrupted) {
logger.info("SIGNAL: gracefully finishing current pages...");
crawler.gracefulFinishOnInterrupt();
if (!crawler.interruptReason) {
logger.info("SIGNAL: interrupt request received...");
crawler.gracefulFinishOnInterrupt(InterruptReason.SignalInterrupted);
} else if (forceTerm || Date.now() - lastSigInt > 200) {
logger.info("SIGNAL: stopping crawl now...");
await crawler.serializeAndExit();

View file

@ -9,7 +9,11 @@ import path from "path";
import { formatErr, LogContext, logger } from "./logger.js";
import { initStorage } from "./storage.js";
import { DISPLAY, type ServiceWorkerOpt } from "./constants.js";
import {
DISPLAY,
PAGE_OP_TIMEOUT_SECS,
type ServiceWorkerOpt,
} from "./constants.js";
import puppeteer, {
Frame,
@ -20,6 +24,7 @@ import puppeteer, {
} from "puppeteer-core";
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
import { Recorder } from "./recorder.js";
import { timedRun } from "./timing.js";
type BtrixChromeOpts = {
proxy?: string;
@ -35,6 +40,7 @@ type LaunchOpts = {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
emulateDevice?: Record<string, any>;
ondisconnect?: ((err: unknown) => NonNullable<unknown>) | null;
swOpt?: ServiceWorkerOpt;
@ -61,6 +67,8 @@ export class Browser {
swOpt?: ServiceWorkerOpt = "disabled";
crashed = false;
constructor() {
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
}
@ -364,9 +372,24 @@ export class Browser {
}
async close() {
if (this.browser) {
if (!this.browser) {
return;
}
if (!this.crashed) {
this.browser.removeAllListeners("disconnected");
await this.browser.close();
try {
await timedRun(
this.browser.close(),
PAGE_OP_TIMEOUT_SECS,
"Closing Browser Timed Out",
{},
"browser",
true,
);
} catch (e) {
// ignore
}
this.browser = null;
}
}

View file

@ -63,8 +63,21 @@ export enum ExitCodes {
Failed = 9,
OutOfSpace = 3,
BrowserCrashed = 10,
InterruptedGraceful = 11,
InterruptedImmediate = 13,
SignalInterrupted = 11,
FailedLimit = 12,
SignalInterruptedForce = 13,
SizeLimit = 14,
TimeLimit = 15,
DiskUtilization = 16,
Fatal = 17,
ProxyError = 21,
}
// Why a running crawl was interrupted. Each member corresponds to a
// same-named member of ExitCodes (see the switch in Crawler's status
// handling), so the process exit code reflects the interruption cause.
export enum InterruptReason {
  // --sizeLimit threshold reached (exit code 14)
  SizeLimit = 1,
  // --timeLimit threshold reached (exit code 15)
  TimeLimit = 2,
  // --failOnFailedLimit threshold reached (exit code 12)
  FailedLimit = 3,
  // --diskUtilization threshold reached (exit code 16)
  DiskUtilization = 4,
  // browser process crashed (exit code 10)
  BrowserCrashed = 5,
  // SIGINT/SIGTERM received (exit code 11; forced = 13)
  SignalInterrupted = 6,
}

View file

@ -1,13 +1,14 @@
import http from "http";
import url from "url";
import { logger } from "./logger.js";
import { Browser } from "./browser.js";
// ===========================================================================
export class HealthChecker {
port: number;
errorThreshold: number;
healthServer: http.Server;
browserCrashed = false;
browser: Browser;
updater: (() => Promise<void>) | null;
@ -16,9 +17,11 @@ export class HealthChecker {
constructor(
port: number,
errorThreshold: number,
browser: Browser,
updater: (() => Promise<void>) | null = null,
) {
this.port = port;
this.browser = browser;
this.errorThreshold = errorThreshold;
this.healthServer = http.createServer((...args) =>
@ -34,7 +37,7 @@ export class HealthChecker {
const pathname = req.url ? url.parse(req.url).pathname : "";
switch (pathname) {
case "/healthz":
if (this.errorCount < this.errorThreshold && !this.browserCrashed) {
if (this.errorCount < this.errorThreshold && !this.browser.crashed) {
logger.debug(
`health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`,
{},

View file

@ -997,7 +997,7 @@ export class Recorder {
while (
numPending &&
!this.pageFinished &&
!this.crawler.interrupted &&
!this.crawler.interruptReason &&
!this.crawler.postCrawling
) {
pending = [];

View file

@ -7,7 +7,6 @@ import { rxEscape } from "./seeds.js";
import { CDPSession, Page } from "puppeteer-core";
import { PageState, WorkerId } from "./state.js";
import { Crawler } from "../crawler.js";
import { PAGE_OP_TIMEOUT_SECS } from "./constants.js";
const MAX_REUSE = 5;
@ -233,8 +232,7 @@ export class PageWorker {
}
if (retry >= MAX_REUSE) {
this.crawler.browserCrashed = true;
this.crawler.interrupted = true;
this.crawler.markBrowserCrashed();
throw new Error("Unable to load new page, browser needs restart");
}
@ -433,16 +431,7 @@ export async function runWorkers(
await closeWorkers();
if (!crawler.browserCrashed) {
await timedRun(
crawler.browser.close(),
PAGE_OP_TIMEOUT_SECS,
"Closing Browser Timed Out",
{},
"worker",
true,
);
}
await crawler.browser.close();
}
// ===========================================================================

View file

@ -25,7 +25,46 @@ test("ensure crawl fails if failOnFailedLimit is reached", async () => {
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/will404 --url https://specs.webrecorder.net --failOnInvalidStatus --failOnFailedLimit 1 --limit 10 --collection faillimitreached",
);
} catch (error) {
expect(error.code).toEqual(17);
expect(error.code).toEqual(12);
passed = false;
}
expect(passed).toBe(false);
});
// A crawl interrupted by --timeLimit must exit the container with
// code 15 (ExitCodes.TimeLimit), i.e. the docker exec must reject.
test("ensure crawl fails if timeLimit is reached", async () => {
  let exitedWithError = false;
  try {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --timeLimit 1 --limit 2 --collection failontimelimitreached",
    );
  } catch (error) {
    exitedWithError = true;
    expect(error.code).toEqual(15);
  }
  expect(exitedWithError).toBe(true);
});
// A crawl interrupted by --sizeLimit must exit the container with
// code 14 (ExitCodes.SizeLimit), i.e. the docker exec must reject.
test("ensure crawl fails if sizeLimit is reached", async () => {
  let exitedWithError = false;
  try {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --sizeLimit 1 --limit 2 --collection failonsizelimitreached",
    );
  } catch (error) {
    exitedWithError = true;
    expect(error.code).toEqual(14);
  }
  expect(exitedWithError).toBe(true);
});
test("ensure crawl fails if diskUtilizationLimit is reached", async () => {
let passed = true;
try {
await exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --diskUtilization 1 --limit 2 --collection failonsizelimitreached",
);
} catch (error) {
expect(error.code).toEqual(16);
passed = false;
}
expect(passed).toBe(false);