mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
add locking mechanism for deleteOnExit and running post-crawl steps
only run post-crawl steps on the last instance, when all locks are released
This commit is contained in:
parent
59c1bd5626
commit
c0508d44a7
2 changed files with 16 additions and 3 deletions
17
crawler.js
17
crawler.js
|
@ -29,6 +29,7 @@ const { S3StorageSync, getFileSize } = require("./util/storage");
|
|||
const { ScreenCaster, WSTransport, RedisPubSubTransport } = require("./util/screencaster");
|
||||
const { parseArgs } = require("./util/argParser");
|
||||
const { initRedis } = require("./util/redis");
|
||||
const { Lock } = require("./util/lock");
|
||||
|
||||
const { getBrowserExe, loadProfile, evaluateWithCLI } = require("./util/browser");
|
||||
|
||||
|
@ -89,6 +90,9 @@ class Crawler {
|
|||
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
|
||||
|
||||
this.blockRules = null;
|
||||
|
||||
// lock is obtained if last/single process in parallel runs
|
||||
this.lock = this.params.deleteOnExit ? new Lock(this.collDir) : null;
|
||||
}
|
||||
|
||||
statusLog(...args) {
|
||||
|
@ -217,9 +221,13 @@ class Crawler {
|
|||
subprocesses.push(child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts));
|
||||
|
||||
process.on("exit", (code) => {
|
||||
if (this.params.deleteOnExit && (code === 0 || code === 1)) {
|
||||
if (this.params.deleteOnExit && this.lock.release() && (code === 0 || code === 3)) {
|
||||
console.log(`Deleting ${this.collDir} before exit`);
|
||||
fs.rmSync(this.collDir, { recursive: true, force: true });
|
||||
try {
|
||||
fs.rmSync(this.collDir, { recursive: true, force: true });
|
||||
} catch(e) {
|
||||
console.warn(e);
|
||||
}
|
||||
}
|
||||
|
||||
for (const proc of subprocesses) {
|
||||
|
@ -461,6 +469,11 @@ class Crawler {
|
|||
// extra wait for all resources to land into WARCs
|
||||
await this.awaitPendingClear();
|
||||
|
||||
if (this.lock && !this.lock.release()) {
|
||||
this.statusLog("Exiting, other parallel jobs will finish...");
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.params.combineWARC) {
|
||||
await this.combineWARC();
|
||||
}
|
||||
|
|
2
main.js
2
main.js
|
@ -35,7 +35,7 @@ process.on("SIGINT", async () => {
|
|||
process.on("SIGTERM", async () => {
|
||||
if (forceTerm) {
|
||||
console.log("SIGTERM received, exit immediately");
|
||||
process.exit(1);
|
||||
process.exit(3);
|
||||
}
|
||||
|
||||
console.log("SIGTERM received...");
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue