Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 06:23:16 +00:00
Add more exit codes to detect interruption reason (#764)

Fix #584

- Replace interrupted with interruptReason
- Distinct exit codes for different interrupt reasons: SizeLimit (14), TimeLimit (15), FailedLimit (12), DiskUtilization (16) are used when an interrupt happens for these reasons, in addition to existing reasons BrowserCrashed (10), SignalInterrupted (11) and SignalInterruptedForce (13)
- Doc fix to cli args

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
parent 846f0355f6
commit fc56c2cf76

9 changed files with 139 additions and 52 deletions
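Since the point of this change is that the crawler's exit code now encodes why a crawl stopped, a supervising process can branch on it. Below is a minimal, hypothetical wrapper sketch (not part of this commit) that maps the codes listed above to readable reasons; the docker invocation is illustrative only.

// Hypothetical supervisor sketch: react to browsertrix-crawler exit codes.
// The numeric values are the ones introduced or kept by this PR.
import { spawn } from "node:child_process";

const EXIT_REASONS: Record<number, string> = {
  10: "browser crashed",
  11: "signal interrupted (graceful)",
  12: "failed page limit reached",
  13: "signal interrupted (forced)",
  14: "size limit reached",
  15: "time limit reached",
  16: "disk utilization limit reached",
};

const proc = spawn("docker", [
  "run",
  "webrecorder/browsertrix-crawler",
  "crawl",
  "--url",
  "https://example.com/",
]);

proc.on("exit", (code) => {
  if (code === 0) {
    console.log("crawl completed");
  } else if (code !== null && code in EXIT_REASONS) {
    console.log(`crawl interrupted: ${EXIT_REASONS[code]}`);
  } else {
    console.log(`crawl failed with exit code ${code}`);
  }
});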
In the CLI options docs:

@@ -243,7 +243,7 @@ Options:
     --maxPageRetries, --retries      If set, number of times to retry a p
                                      age that failed to load before page
                                      is considered to have failed
-                                     [number] [default: 1]
+                                     [number] [default: 2]
     --failOnFailedSeed               If set, crawler will fail with exit
                                      code 1 if any seed fails. When combi
                                      ned with --failOnInvalidStatus,will
In crawler.ts:

@@ -48,6 +48,7 @@ import {
   PAGE_OP_TIMEOUT_SECS,
   SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
   ExitCodes,
+  InterruptReason,
 } from "./util/constants.js";

 import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
@@ -168,8 +169,7 @@ export class Crawler {

   skipTextDocs = 0;

-  interrupted = false;
-  browserCrashed = false;
+  interruptReason: InterruptReason | null = null;
   finalExit = false;
   uploadAndDeleteLocal = false;
   done = false;
@@ -307,7 +307,7 @@ export class Crawler {

     this.healthChecker = null;

-    this.interrupted = false;
+    this.interruptReason = null;
     this.finalExit = false;
     this.uploadAndDeleteLocal = false;

@@ -596,11 +596,28 @@ export class Crawler {
     } else if (stopped) {
       status = "done";
       logger.info("Crawl gracefully stopped on request");
-    } else if (this.interrupted) {
+    } else if (this.interruptReason) {
       status = "interrupted";
-      exitCode = this.browserCrashed
-        ? ExitCodes.BrowserCrashed
-        : ExitCodes.InterruptedGraceful;
+      switch (this.interruptReason) {
+        case InterruptReason.SizeLimit:
+          exitCode = ExitCodes.SizeLimit;
+          break;
+        case InterruptReason.BrowserCrashed:
+          exitCode = ExitCodes.BrowserCrashed;
+          break;
+        case InterruptReason.SignalInterrupted:
+          exitCode = ExitCodes.SignalInterrupted;
+          break;
+        case InterruptReason.DiskUtilization:
+          exitCode = ExitCodes.DiskUtilization;
+          break;
+        case InterruptReason.FailedLimit:
+          exitCode = ExitCodes.FailedLimit;
+          break;
+        case InterruptReason.TimeLimit:
+          exitCode = ExitCodes.TimeLimit;
+          break;
+      }
     }
   }
 } catch (e) {
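The switch above has no default case, so an InterruptReason added later without a matching case would leave exitCode at whatever it was previously set to. A sketch of an equivalent, compiler-checked alternative (not the code in this PR), using the two enums from util/constants.ts:

// Sketch only: a lookup table TypeScript checks for completeness, since a
// Record keyed by an enum must supply every member.
const EXIT_CODE_FOR_REASON: Record<InterruptReason, ExitCodes> = {
  [InterruptReason.SizeLimit]: ExitCodes.SizeLimit,
  [InterruptReason.TimeLimit]: ExitCodes.TimeLimit,
  [InterruptReason.FailedLimit]: ExitCodes.FailedLimit,
  [InterruptReason.DiskUtilization]: ExitCodes.DiskUtilization,
  [InterruptReason.BrowserCrashed]: ExitCodes.BrowserCrashed,
  [InterruptReason.SignalInterrupted]: ExitCodes.SignalInterrupted,
};

// usage: exitCode = EXIT_CODE_FOR_REASON[this.interruptReason];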
@@ -1378,7 +1395,7 @@ self.__bx_behaviors.selectMainBehavior();
   }

   async checkLimits() {
-    let interrupt = false;
+    let interrupt: InterruptReason | null = null;

     const size = await this.updateCurrSize();

@@ -1387,7 +1404,7 @@ self.__bx_behaviors.selectMainBehavior();
       logger.info(
         `Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`,
       );
-      interrupt = true;
+      interrupt = InterruptReason.SizeLimit;
     }
   }

@@ -1397,7 +1414,7 @@ self.__bx_behaviors.selectMainBehavior();
       logger.info(
         `Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`,
       );
-      interrupt = true;
+      interrupt = InterruptReason.TimeLimit;
     }
   }

@@ -1409,7 +1426,7 @@ self.__bx_behaviors.selectMainBehavior();
         size,
       );
       if (diskUtil.stop === true) {
-        interrupt = true;
+        interrupt = InterruptReason.DiskUtilization;
       }
     }

@@ -1419,18 +1436,21 @@ self.__bx_behaviors.selectMainBehavior();
       if (numFailed >= failedLimit) {
         logger.fatal(
           `Failed threshold reached ${numFailed} >= ${failedLimit}, failing crawl`,
+          {},
+          "general",
+          ExitCodes.FailedLimit,
         );
       }
     }

     if (interrupt) {
       this.uploadAndDeleteLocal = true;
-      this.gracefulFinishOnInterrupt();
+      this.gracefulFinishOnInterrupt(interrupt);
     }
   }

-  gracefulFinishOnInterrupt() {
-    this.interrupted = true;
+  gracefulFinishOnInterrupt(interruptReason: InterruptReason) {
+    this.interruptReason = interruptReason;
     logger.info("Crawler interrupted, gracefully finishing current pages");
     if (!this.params.waitOnDone && !this.params.restartsOnError) {
       this.finalExit = true;
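Note that in checkLimits each check reassigns interrupt, so if more than one limit trips in the same pass, the reason from the last matching check is the one passed to gracefulFinishOnInterrupt. A condensed sketch of that flow (simplified, not the repo's exact code; the real method also checks disk utilization and the failed-page limit):

// Simplified sketch of the checkLimits() pattern above.
async function checkLimitsSketch(
  size: number,
  elapsedSecs: number,
  params: { sizeLimit?: number; timeLimit?: number },
): Promise<InterruptReason | null> {
  let interrupt: InterruptReason | null = null;

  if (params.sizeLimit && size >= params.sizeLimit) {
    interrupt = InterruptReason.SizeLimit;
  }
  if (params.timeLimit && elapsedSecs > params.timeLimit) {
    interrupt = InterruptReason.TimeLimit; // overwrites SizeLimit if both trip
  }
  return interrupt;
}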
@@ -1457,23 +1477,25 @@ self.__bx_behaviors.selectMainBehavior();
   async serializeAndExit() {
     await this.serializeConfig();

-    if (this.interrupted) {
-      await this.browser.close();
+    if (this.interruptReason) {
+      await closeWorkers(0);
+      await this.browser.close();
       await this.closeFiles();

       if (!this.done) {
         await this.setStatusAndExit(
-          ExitCodes.InterruptedImmediate,
+          ExitCodes.SignalInterruptedForce,
           "interrupted",
         );
         return;
       }
     }

     await this.setStatusAndExit(ExitCodes.Success, "done");
   }

   async isCrawlRunning() {
-    if (this.interrupted) {
+    if (this.interruptReason) {
       return false;
     }

@@ -1495,6 +1517,7 @@ self.__bx_behaviors.selectMainBehavior();
       this.healthChecker = new HealthChecker(
         this.params.healthCheckPort,
         this.params.workers,
+        this.browser,
         async () => {
           await this.updateCurrSize();
         },
@@ -1726,7 +1749,7 @@ self.__bx_behaviors.selectMainBehavior();
     if (
       this.params.generateWACZ &&
       !this.params.dryRun &&
-      (!this.interrupted || this.finalExit || this.uploadAndDeleteLocal)
+      (!this.interruptReason || this.finalExit || this.uploadAndDeleteLocal)
     ) {
       const uploaded = await this.generateWACZ();

@@ -1742,7 +1765,7 @@ self.__bx_behaviors.selectMainBehavior();
       }
     }

-    if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) {
+    if (this.params.waitOnDone && (!this.interruptReason || this.finalExit)) {
       this.done = true;
       logger.info("All done, waiting for signal...");
       await this.crawlState.setStatus("done");
@@ -1753,11 +1776,8 @@ self.__bx_behaviors.selectMainBehavior();
   }

   markBrowserCrashed() {
-    this.interrupted = true;
-    this.browserCrashed = true;
-    if (this.healthChecker) {
-      this.healthChecker.browserCrashed = true;
-    }
+    this.interruptReason = InterruptReason.BrowserCrashed;
+    this.browser.crashed = true;
   }

   async closeLog(): Promise<void> {
In main.ts:

@@ -5,7 +5,7 @@ import { setExitOnRedisError } from "./util/redis.js";
 import { Crawler } from "./crawler.js";
 import { ReplayCrawler } from "./replaycrawler.js";
 import fs from "node:fs";
-import { ExitCodes } from "./util/constants.js";
+import { ExitCodes, InterruptReason } from "./util/constants.js";

 let crawler: Crawler | null = null;

@@ -29,9 +29,9 @@ async function handleTerminate(signame: string) {
   try {
     await crawler.checkCanceled();

-    if (!crawler.interrupted) {
-      logger.info("SIGNAL: gracefully finishing current pages...");
-      crawler.gracefulFinishOnInterrupt();
+    if (!crawler.interruptReason) {
+      logger.info("SIGNAL: interrupt request received...");
+      crawler.gracefulFinishOnInterrupt(InterruptReason.SignalInterrupted);
     } else if (forceTerm || Date.now() - lastSigInt > 200) {
       logger.info("SIGNAL: stopping crawl now...");
       await crawler.serializeAndExit();
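This handler implements a two-stage shutdown: the first signal records SignalInterrupted and lets in-flight pages finish, while a repeated or forced signal serializes state and exits immediately with SignalInterruptedForce. A generic sketch of the same pattern (hypothetical names, not this repo's code):

// Two-stage shutdown sketch: first signal is graceful, second forces exit.
let interruptRequested = false;

function onSignal(signame: string) {
  if (!interruptRequested) {
    interruptRequested = true;
    console.log(`${signame}: finishing current work, signal again to force exit`);
    // begin graceful shutdown here
  } else {
    console.log(`${signame}: forcing exit`);
    process.exit(13); // ExitCodes.SignalInterruptedForce in this PR
  }
}

process.on("SIGINT", () => onSignal("SIGINT"));
process.on("SIGTERM", () => onSignal("SIGTERM"));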
In util/browser.ts:

@@ -9,7 +9,11 @@ import path from "path";
 import { formatErr, LogContext, logger } from "./logger.js";
 import { initStorage } from "./storage.js";

-import { DISPLAY, type ServiceWorkerOpt } from "./constants.js";
+import {
+  DISPLAY,
+  PAGE_OP_TIMEOUT_SECS,
+  type ServiceWorkerOpt,
+} from "./constants.js";

 import puppeteer, {
   Frame,
@@ -20,6 +24,7 @@ import puppeteer, {
 } from "puppeteer-core";
 import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
 import { Recorder } from "./recorder.js";
+import { timedRun } from "./timing.js";

 type BtrixChromeOpts = {
   proxy?: string;
@@ -35,6 +40,7 @@ type LaunchOpts = {
   // TODO: Fix this the next time the file is edited.
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   emulateDevice?: Record<string, any>;
+
   ondisconnect?: ((err: unknown) => NonNullable<unknown>) | null;

   swOpt?: ServiceWorkerOpt;
@@ -61,6 +67,8 @@ export class Browser {

   swOpt?: ServiceWorkerOpt = "disabled";

+  crashed = false;
+
   constructor() {
     this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
   }
@@ -364,9 +372,24 @@ export class Browser {
   }

   async close() {
-    if (this.browser) {
+    if (!this.browser) {
+      return;
+    }
+
+    if (!this.crashed) {
       this.browser.removeAllListeners("disconnected");
-      await this.browser.close();
+      try {
+        await timedRun(
+          this.browser.close(),
+          PAGE_OP_TIMEOUT_SECS,
+          "Closing Browser Timed Out",
+          {},
+          "browser",
+          true,
+        );
+      } catch (e) {
+        // ignore
+      }
       this.browser = null;
     }
   }
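close() now bounds browser shutdown with timedRun (imported above from util/timing.ts), so a wedged browser process cannot hang the crawler's exit path. A minimal stand-in showing the same idea (hypothetical helper, not the repo's implementation):

// Minimal stand-in for the timedRun pattern: race a promise against a timer
// and reject if the timer wins, so callers can treat a hang as an error.
function withTimeout<T>(promise: Promise<T>, secs: number, msg: string): Promise<T> {
  return new Promise<T>((resolve, reject) => {
    const timer = setTimeout(() => reject(new Error(msg)), secs * 1000);
    promise.then(
      (value) => {
        clearTimeout(timer);
        resolve(value);
      },
      (err) => {
        clearTimeout(timer);
        reject(err);
      },
    );
  });
}

// usage, mirroring close() above:
// await withTimeout(browser.close(), PAGE_OP_TIMEOUT_SECS, "Closing Browser Timed Out");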
In util/constants.ts:

@@ -63,8 +63,21 @@ export enum ExitCodes {
   Failed = 9,
   OutOfSpace = 3,
   BrowserCrashed = 10,
-  InterruptedGraceful = 11,
-  InterruptedImmediate = 13,
+  SignalInterrupted = 11,
+  FailedLimit = 12,
+  SignalInterruptedForce = 13,
+  SizeLimit = 14,
+  TimeLimit = 15,
+  DiskUtilization = 16,
   Fatal = 17,
   ProxyError = 21,
 }
+
+export enum InterruptReason {
+  SizeLimit = 1,
+  TimeLimit = 2,
+  FailedLimit = 3,
+  DiskUtilization = 4,
+  BrowserCrashed = 5,
+  SignalInterrupted = 6,
+}
In util/healthcheck.ts:

@@ -1,13 +1,14 @@
 import http from "http";
 import url from "url";
 import { logger } from "./logger.js";
+import { Browser } from "./browser.js";

 // ===========================================================================
 export class HealthChecker {
   port: number;
   errorThreshold: number;
   healthServer: http.Server;
-  browserCrashed = false;
+  browser: Browser;

   updater: (() => Promise<void>) | null;

@@ -16,9 +17,11 @@ export class HealthChecker {
   constructor(
     port: number,
     errorThreshold: number,
+    browser: Browser,
     updater: (() => Promise<void>) | null = null,
   ) {
     this.port = port;
+    this.browser = browser;
     this.errorThreshold = errorThreshold;

     this.healthServer = http.createServer((...args) =>
@@ -34,7 +37,7 @@ export class HealthChecker {
     const pathname = req.url ? url.parse(req.url).pathname : "";
     switch (pathname) {
       case "/healthz":
-        if (this.errorCount < this.errorThreshold && !this.browserCrashed) {
+        if (this.errorCount < this.errorThreshold && !this.browser.crashed) {
          logger.debug(
            `health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`,
            {},
In util/recorder.ts:

@@ -997,7 +997,7 @@ export class Recorder {
     while (
       numPending &&
       !this.pageFinished &&
-      !this.crawler.interrupted &&
+      !this.crawler.interruptReason &&
       !this.crawler.postCrawling
     ) {
       pending = [];
In util/worker.ts:

@@ -7,7 +7,6 @@ import { rxEscape } from "./seeds.js";
 import { CDPSession, Page } from "puppeteer-core";
 import { PageState, WorkerId } from "./state.js";
 import { Crawler } from "../crawler.js";
-import { PAGE_OP_TIMEOUT_SECS } from "./constants.js";

 const MAX_REUSE = 5;

@@ -233,8 +232,7 @@ export class PageWorker {
     }

     if (retry >= MAX_REUSE) {
-      this.crawler.browserCrashed = true;
-      this.crawler.interrupted = true;
+      this.crawler.markBrowserCrashed();
       throw new Error("Unable to load new page, browser needs restart");
     }

@@ -433,16 +431,7 @@ export async function runWorkers(

   await closeWorkers();

-  if (!crawler.browserCrashed) {
-    await timedRun(
-      crawler.browser.close(),
-      PAGE_OP_TIMEOUT_SECS,
-      "Closing Browser Timed Out",
-      {},
-      "worker",
-      true,
-    );
-  }
+  await crawler.browser.close();
 }

 // ===========================================================================
In the exit-code tests:

@@ -25,7 +25,46 @@ test("ensure crawl fails if failOnFailedLimit is reached", async () => {
       "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/will404 --url https://specs.webrecorder.net --failOnInvalidStatus --failOnFailedLimit 1 --limit 10 --collection faillimitreached",
     );
   } catch (error) {
-    expect(error.code).toEqual(17);
+    expect(error.code).toEqual(12);
     passed = false;
   }
   expect(passed).toBe(false);
 });
+
+test("ensure crawl fails if timeLimit is reached", async () => {
+  let passed = true;
+  try {
+    await exec(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --timeLimit 1 --limit 2 --collection failontimelimitreached",
+    );
+  } catch (error) {
+    expect(error.code).toEqual(15);
+    passed = false;
+  }
+  expect(passed).toBe(false);
+});
+
+test("ensure crawl fails if sizeLimit is reached", async () => {
+  let passed = true;
+  try {
+    await exec(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --sizeLimit 1 --limit 2 --collection failonsizelimitreached",
+    );
+  } catch (error) {
+    expect(error.code).toEqual(14);
+    passed = false;
+  }
+  expect(passed).toBe(false);
+});
+
+test("ensure crawl fails if diskUtilizationLimit is reached", async () => {
+  let passed = true;
+  try {
+    await exec(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --diskUtilization 1 --limit 2 --collection failonsizelimitreached",
+    );
+  } catch (error) {
+    expect(error.code).toEqual(16);
+    passed = false;
+  }
+  expect(passed).toBe(false);
+});
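These tests work because docker run propagates the container's exit status, and Node's promisified exec rejects with an error whose code property carries that status. A standalone sketch of the same check (assumes Docker and the published image are available; the helper name is illustrative):

// Sketch: read the crawler container's exit code directly.
import { exec as execCb } from "node:child_process";
import { promisify } from "node:util";

const execP = promisify(execCb);

async function getCrawlExitCode(args: string): Promise<number> {
  try {
    await execP(`docker run webrecorder/browsertrix-crawler crawl ${args}`);
    return 0;
  } catch (error) {
    // for a non-zero container exit, error.code holds the exit status
    return (error as { code: number }).code;
  }
}

// e.g. expect(await getCrawlExitCode("--url https://example.com/ --timeLimit 1")).toEqual(15);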