browsertrix-crawler/main.js
Ilya Kreymer 65933c6b12
Interrupt Handling Fixes (#167)
* interrupts: simplify interrupt behavior:
- SIGTERM/SIGINT behave the same way, triggering a graceful shutdown after page load

improvements to remote state / parallel crawlers (for browsertrix-cloud):
- SIGUSR1 before SIGINT/SIGTERM ensures data is saved and marks the crawler as done - for use when gracefully stopping a crawl
- SIGUSR2 before SIGINT/SIGTERM ensures data is saved but does not mark the crawler as done - for use when scaling down a single crawler

* scope check: check the scope of each URL retrieved from the queue (in case scoping rules have changed); URLs matching a seed are automatically in scope
2022-09-20 17:09:52 -07:00

64 lines | 1.3 KiB | JavaScript | Executable file

#!/usr/bin/env node

var crawler = null;

var lastSigInt = 0;
let forceTerm = false;

// Handle SIGINT/SIGTERM: on the first signal, finish the current pages
// gracefully; if graceful finishing has already begun (drainMax set), a
// repeated signal (or a prior SIGABRT) serializes state and exits now.
async function handleTerminate(signame) {
  console.log(`${signame} received...`);
  if (!crawler || !crawler.crawlState) {
    console.log("error: no crawler running, exiting");
    process.exit(1);
  }

  if (crawler.done) {
    console.log("success: crawler done, exiting");
    process.exit(0);
  }

  try {
    if (!crawler.crawlState.drainMax) {
      console.log("SIGNAL: gracefully finishing current pages...");
      crawler.gracefulFinish();

    } else if (forceTerm || (Date.now() - lastSigInt) > 200) {
      console.log("SIGNAL: stopping crawl now...");
      await crawler.serializeAndExit();
    }
    lastSigInt = Date.now();
  } catch (e) {
    console.log(e);
  }
}

process.on("SIGINT", () => handleTerminate("SIGINT"));

process.on("SIGTERM", () => handleTerminate("SIGTERM"));

// SIGABRT: force an immediate exit on the next SIGTERM/SIGINT
process.on("SIGABRT", async () => {
  console.log("SIGABRT received, will force immediate exit on SIGTERM/SIGINT");
  forceTerm = true;
});

// SIGUSR1: save state and mark the crawler as done (graceful stop)
process.on("SIGUSR1", () => {
  if (crawler) {
    crawler.prepareForExit(true);
  }
});

// SIGUSR2: save state without marking the crawler as done (scale down)
process.on("SIGUSR2", () => {
  if (crawler) {
    crawler.prepareForExit(false);
  }
});

const { Crawler } = require("./crawler");

crawler = new Crawler();
crawler.run();
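
As a rough illustration of the signal protocol described in the commit message, a controller such as browsertrix-cloud can send SIGUSR1 (or SIGUSR2) before SIGTERM so that crawl state is saved prior to shutdown. The sketch below is a minimal, hypothetical example and is not part of this repository; the "crawl" command, its arguments, and the stopGracefully/scaleDownCrawler helpers are placeholder names chosen for illustration.

// Hypothetical controller-side sketch (not part of browsertrix-crawler):
// send SIGUSR1/SIGUSR2 ahead of SIGTERM so the crawler saves its state first.
const { spawn } = require("child_process");

// Placeholder command and arguments for launching the crawler as a child process.
const crawlerProc = spawn("crawl", ["--config", "/crawls/config.yaml"], {
  stdio: "inherit",
});

// Graceful stop: save data, mark the crawler as done, then shut down.
function stopGracefully() {
  crawlerProc.kill("SIGUSR1");
  crawlerProc.kill("SIGTERM");
}

// Scale down a single crawler: save data without marking the crawl as done, then shut down.
function scaleDownCrawler() {
  crawlerProc.kill("SIGUSR2");
  crawlerProc.kill("SIGTERM");
}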