browsertrix-crawler/main.js
Ilya Kreymer cf90304fa7
0.6.0 Wait State + Screencasting Fixes (#141)
* new options:
- to support browsertrix-cloud, add a --waitOnDone option, which has browsertrix crawler wait when finished 
- when running with redis shared state, set the `<crawl id>:status` field to `running`, `failing`, `failed` or `done` to let job controller know crawl is finished.
- set redis state to `failing` in case of exception, set to `failed` in case of 3 or more failed exits within 60 seconds (todo: make customizable)
- when receiving a SIGUSR1, assume final shutdown and finalize files (eg. save WACZ) before exiting.
- also write WACZ if exiting due to size limit exceeded, but not due to other interruptions
- change sleep() to be in seconds

* misc fixes:
- crawlstate.finished() -> isFinished() - return if >0 pages and none left in queue
- don't fail crawl if isFinished() is true
- don't keep looping in pending wait for urls to finish if received abort request

* screencast improvements (fix related to webrecorder/browsertrix-cloud#233)
- more optimized screencasting, don't close and restart after every page.
- don't assume targets change after every page, they don't in window mode!
- only send 'close' message when target is actually closed

* bump to 0.6.0
2022-06-17 11:58:44 -07:00

64 lines
1.2 KiB
JavaScript
Executable file

#!/usr/bin/env node
// Mutable module-level state shared by the signal handlers below.
let crawler = null;    // assigned once the Crawler is constructed at the bottom of the file
let lastSigInt = 0;    // timestamp (ms) of the most recently handled termination signal
let forceTerm = false; // set on SIGABRT: makes the next SIGTERM exit immediately
/**
 * Shared SIGINT/SIGTERM shutdown logic.
 * First signal: switch the crawl state to "drain" so in-flight pages finish.
 * A later signal (more than 200ms after the previous one): serialize the
 * config and exit immediately.
 * Exits right away with code 0 when no crawl state exists yet.
 */
async function handleTerminate() {
  if (!crawler || !crawler.crawlState) {
    // Nothing to shut down yet.
    process.exit(0);
  }

  try {
    const state = crawler.crawlState;
    if (!state.drainMax) {
      // First signal: stop taking new URLs, let current pages complete.
      console.log("SIGNAL: gracefully finishing current pages...");
      state.setDrain(crawler.finalExit);
    } else {
      const sinceLast = Date.now() - lastSigInt;
      // Ignore rapid duplicate signals; a genuinely repeated signal stops now.
      if (sinceLast > 200) {
        console.log("SIGNAL: stopping crawl now...");
        await crawler.serializeConfig();
        process.exit(0);
      }
    }
    lastSigInt = Date.now();
  } catch (e) {
    console.log(e);
  }
}
// SIGINT: delegate to the shared graceful-shutdown logic.
process.on("SIGINT", async () => {
  console.log("SIGINT received...");
  return handleTerminate();
});
// SIGUSR1: mark the next shutdown as final so output files (e.g. WACZ)
// are finalized before exiting.
process.on("SIGUSR1", () => {
  if (!crawler) {
    return;
  }
  crawler.finalExit = true;
});
// SIGTERM: exit immediately when a force-exit was requested (via SIGABRT)
// or the crawl has already finished; otherwise shut down gracefully.
process.on("SIGTERM", async () => {
  // Guard against a signal arriving before the crawler is constructed
  // (the original dereferenced crawler.done unconditionally, which would
  // throw if crawler were still null — the other handlers all guard it).
  const done = crawler && crawler.done;
  if (forceTerm || done) {
    console.log("SIGTERM received, exit immediately");
    process.exit(done ? 0 : 1);
  }
  console.log("SIGTERM received...");
  await handleTerminate();
});
// SIGABRT: flag that the next SIGTERM should exit immediately with a
// failure code. Handler is synchronous (nothing is awaited) and guards
// crawler against being null, consistent with the SIGUSR1 handler.
process.on("SIGABRT", () => {
  console.log("SIGABRT received, will force immediate exit on SIGTERM");
  forceTerm = true;
  if (crawler) {
    crawler.exitCode = 1;
  }
});
// Construct and start the crawler; the signal handlers above close over
// the `crawler` variable assigned here.
const crawlerModule = require("./crawler");
crawler = new crawlerModule.Crawler();
crawler.run();