mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00

* interrupts: simplify interrupt behavior: - SIGTERM/SIGINT behave the same way, triggering a graceful shutdown after page load. Improvements of remote state / parallel crawlers (for browsertrix-cloud): - SIGUSR1 before SIGINT/SIGTERM ensures data is saved and marks the crawler as done — for use with gracefully stopping a crawl. - SIGUSR2 before SIGINT/SIGTERM ensures data is saved but does not mark the crawler as done — for use with scaling down a single crawler. * scope check: check the scope of each URL retrieved from the queue (in case scoping rules changed); URLs matching a seed are automatically in scope!
64 lines
1.3 KiB
JavaScript
Executable file
64 lines
1.3 KiB
JavaScript
Executable file
#!/usr/bin/env node
|
|
|
|
// Active Crawler instance; assigned at the bottom of this file after all
// signal handlers have been registered.
let crawler = null;

// Timestamp (ms) of the last SIGINT/SIGTERM handled, used to debounce
// duplicate signals delivered in quick succession (see handleTerminate).
let lastSigInt = 0;

// Set by SIGABRT: forces immediate serialize-and-exit on the next
// SIGTERM/SIGINT instead of waiting for a second distinct signal.
let forceTerm = false;
|
/**
 * Shared SIGINT/SIGTERM handler.
 *
 * Exits immediately when no crawler is running (code 1) or when the crawl
 * has already finished (code 0). Otherwise, the first signal requests a
 * graceful finish of in-progress pages; once draining has begun
 * (crawlState.drainMax is set), a later distinct signal — or any signal
 * after SIGABRT set forceTerm — serializes state and exits. Signals
 * arriving within 200 ms of the previous one are treated as duplicates
 * and debounced.
 *
 * @param {string} signame - Name of the received signal, for logging.
 */
async function handleTerminate(signame) {
  console.log(`${signame} received...`);

  if (!crawler || !crawler.crawlState) {
    console.log("error: no crawler running, exiting");
    process.exit(1);
  }

  if (crawler.done) {
    console.log("success: crawler done, exiting");
    process.exit(0);
  }

  try {
    if (crawler.crawlState.drainMax) {
      // Already draining: a second, non-duplicate signal (or a prior
      // SIGABRT) means "stop now" rather than "finish gracefully".
      const sinceLast = Date.now() - lastSigInt;
      if (forceTerm || sinceLast > 200) {
        console.log("SIGNAL: stopping crawl now...");
        await crawler.serializeAndExit();
      }
    } else {
      console.log("SIGNAL: gracefully finishing current pages...");
      crawler.gracefulFinish();
    }
    lastSigInt = Date.now();
  } catch (err) {
    console.log(err);
  }
}
|
|
|
|
// SIGINT and SIGTERM share the same graceful-shutdown path.
for (const signame of ["SIGINT", "SIGTERM"]) {
  process.on(signame, () => handleTerminate(signame));
}

// SIGABRT does not terminate by itself; it arms forceTerm so the next
// SIGTERM/SIGINT skips the graceful path and exits immediately.
process.on("SIGABRT", async () => {
  console.log("SIGABRT received, will force immediate exit on SIGTERM/SIGINT");
  forceTerm = true;
});
|
|
|
|
// SIGUSR1: save state and mark the crawler as done (graceful crawl stop).
// SIGUSR2: save state without marking done (scaling down a single crawler).
// Both are no-ops until the Crawler instance has been created below.
for (const [signame, markDone] of [["SIGUSR1", true], ["SIGUSR2", false]]) {
  process.on(signame, () => {
    if (crawler) {
      crawler.prepareForExit(markDone);
    }
  });
}
|
|
|
|
|
|
|
|
// The Crawler module is loaded only after every signal handler above is
// registered, so no signal window exists where the crawl is running but
// unhandled.
const crawlerModule = require("./crawler");

crawler = new crawlerModule.Crawler();
crawler.run();
|
|
|
|
|