profiles:

- add our own signal handling to create-login-profile to ensure fast exit in k8s
- print crawler version info string on startup
This commit is contained in:
Ilya Kreymer 2024-03-16 16:19:42 -07:00
parent f96c6a13dc
commit b57dea50b5
3 changed files with 32 additions and 19 deletions

View file

@ -35,7 +35,7 @@ import { initRedis } from "./util/redis.js";
import { logger, formatErr } from "./util/logger.js";
import { WorkerOpts, WorkerState, runWorkers } from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectAllFileSources } from "./util/file_reader.js";
import { collectAllFileSources, getInfoString } from "./util/file_reader.js";
import { Browser } from "./util/browser.js";
@ -428,7 +428,7 @@ export class Crawler {
this.logFH = fs.createWriteStream(this.logFilename);
logger.setExternalLogStream(this.logFH);
this.infoString = await this.getInfoString();
this.infoString = await getInfoString();
logger.info(this.infoString);
logger.info("Seeds", this.params.scopedSeeds);
@ -1008,22 +1008,6 @@ self.__bx_behaviors.selectMainBehavior();
return res ? frame : null;
}
/**
 * Produces the version banner by reading the `version` field from
 * "../package.json" and from "../node_modules/warcio/package.json",
 * both resolved against this module's URL.
 *
 * @returns A string of the form
 *   "Browsertrix-Crawler <version> (with warcio.js <version>)".
 */
async getInfoString() {
  // First manifest: the package.json one directory above this module.
  const crawlerManifestUrl = new URL("../package.json", import.meta.url);
  const crawlerManifestText = await fsp.readFile(crawlerManifestUrl, {
    encoding: "utf-8",
  });
  const crawlerManifest = JSON.parse(crawlerManifestText);

  // Second manifest: the warcio dependency's package.json.
  const warcioManifestUrl = new URL(
    "../node_modules/warcio/package.json",
    import.meta.url,
  );
  const warcioManifestText = await fsp.readFile(warcioManifestUrl, {
    encoding: "utf-8",
  });
  const warcioManifest = JSON.parse(warcioManifestText);

  return `Browsertrix-Crawler ${crawlerManifest.version} (with warcio.js ${warcioManifest.version})`;
}
async createWARCInfo(filename: string) {
const warcVersion = "WARC/1.0";
const type = "warcinfo";

View file

@ -14,6 +14,7 @@ import { logger } from "./util/logger.js";
import { Browser } from "./util/browser.js";
import { initStorage } from "./util/storage.js";
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
import { getInfoString } from "./util/file_reader.js";
const profileHTML = fs.readFileSync(
new URL("../html/createProfile.html", import.meta.url),
@ -118,6 +119,11 @@ function getDefaultWindowSize() {
return `${x},${y}`;
}
/**
 * Logs which signal was received and then ends the process immediately
 * with exit status 1.
 *
 * @param signame Name of the signal being handled; used only in the log line.
 */
function handleTerminate(signame: string) {
  const notice = "Got signal " + signame + ", exiting";
  logger.info(notice);
  process.exit(1);
}
async function main() {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const params: any = yargs(process.argv)
@ -126,6 +132,12 @@ async function main() {
logger.setDebugLogging(true);
logger.info(await getInfoString());
process.on("SIGINT", () => handleTerminate("SIGINT"));
process.on("SIGTERM", () => handleTerminate("SIGTERM"));
if (!params.headless) {
logger.debug("Launching XVFB");
child_process.spawn("Xvfb", [
@ -164,7 +176,7 @@ async function main() {
await browser.launch({
profileUrl: params.profile,
headless: params.headless,
signals: true,
signals: false,
chromeOptions: {
proxy: false,
extraArgs: [

View file

@ -1,4 +1,5 @@
import fs from "fs";
import fsp from "fs/promises";
import path from "path";
const MAX_DEPTH = 2;
@ -48,3 +49,19 @@ export function collectAllFileSources(
return [];
}
/**
 * Builds the version banner from the `version` fields of
 * "../../package.json" and "../../node_modules/warcio/package.json",
 * both resolved against this module's URL.
 *
 * @returns A string of the form
 *   "Browsertrix-Crawler <version> (with warcio.js <version>)".
 */
export async function getInfoString() {
  // Reads a file located relative to this module and parses it as JSON.
  const loadJSON = async (relativePath: string) => {
    const location = new URL(relativePath, import.meta.url);
    const text = await fsp.readFile(location, { encoding: "utf-8" });
    return JSON.parse(text);
  };

  // Reads stay sequential: the top-level manifest first, then warcio's.
  const crawlerPkg = await loadJSON("../../package.json");
  const warcioPkg = await loadJSON("../../node_modules/warcio/package.json");

  return `Browsertrix-Crawler ${crawlerPkg.version} (with warcio.js ${warcioPkg.version})`;
}