profiles:

- add our own signal handling to create-login-profile to ensure fast exit in k8s
- print crawler version info string on startup
This commit is contained in:
Ilya Kreymer 2024-03-16 16:19:42 -07:00
parent f96c6a13dc
commit b57dea50b5
3 changed files with 32 additions and 19 deletions

View file

@ -35,7 +35,7 @@ import { initRedis } from "./util/redis.js";
import { logger, formatErr } from "./util/logger.js"; import { logger, formatErr } from "./util/logger.js";
import { WorkerOpts, WorkerState, runWorkers } from "./util/worker.js"; import { WorkerOpts, WorkerState, runWorkers } from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js"; import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectAllFileSources } from "./util/file_reader.js"; import { collectAllFileSources, getInfoString } from "./util/file_reader.js";
import { Browser } from "./util/browser.js"; import { Browser } from "./util/browser.js";
@ -428,7 +428,7 @@ export class Crawler {
this.logFH = fs.createWriteStream(this.logFilename); this.logFH = fs.createWriteStream(this.logFilename);
logger.setExternalLogStream(this.logFH); logger.setExternalLogStream(this.logFH);
this.infoString = await this.getInfoString(); this.infoString = await getInfoString();
logger.info(this.infoString); logger.info(this.infoString);
logger.info("Seeds", this.params.scopedSeeds); logger.info("Seeds", this.params.scopedSeeds);
@ -1008,22 +1008,6 @@ self.__bx_behaviors.selectMainBehavior();
return res ? frame : null; return res ? frame : null;
} }
/**
 * Produce the one-line version banner for the crawler.
 *
 * Reads this package's package.json and warcio's package.json under
 * node_modules (both addressed relative to this module's URL), then
 * reports the `version` field of each.
 */
async getInfoString() {
  const readOpts = { encoding: "utf-8" } as const;

  // The crawler's own manifest.
  const crawlerManifestText = await fsp.readFile(
    new URL("../package.json", import.meta.url),
    readOpts,
  );
  const crawlerManifest = JSON.parse(crawlerManifestText);

  // The manifest of the warcio dependency.
  const warcioManifestText = await fsp.readFile(
    new URL("../node_modules/warcio/package.json", import.meta.url),
    readOpts,
  );
  const warcioManifest = JSON.parse(warcioManifestText);

  return `Browsertrix-Crawler ${crawlerManifest.version} (with warcio.js ${warcioManifest.version})`;
}
async createWARCInfo(filename: string) { async createWARCInfo(filename: string) {
const warcVersion = "WARC/1.0"; const warcVersion = "WARC/1.0";
const type = "warcinfo"; const type = "warcinfo";

View file

@ -14,6 +14,7 @@ import { logger } from "./util/logger.js";
import { Browser } from "./util/browser.js"; import { Browser } from "./util/browser.js";
import { initStorage } from "./util/storage.js"; import { initStorage } from "./util/storage.js";
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core"; import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
import { getInfoString } from "./util/file_reader.js";
const profileHTML = fs.readFileSync( const profileHTML = fs.readFileSync(
new URL("../html/createProfile.html", import.meta.url), new URL("../html/createProfile.html", import.meta.url),
@ -118,6 +119,11 @@ function getDefaultWindowSize() {
return `${x},${y}`; return `${x},${y}`;
} }
/**
 * Signal callback: log which signal arrived, then end the process
 * immediately with exit status 1.
 *
 * @param signame Name of the received signal; used only in the log message.
 */
function handleTerminate(signame: string) {
  const message = `Got signal ${signame}, exiting`;
  logger.info(message);
  process.exit(1);
}
async function main() { async function main() {
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
const params: any = yargs(process.argv) const params: any = yargs(process.argv)
@ -126,6 +132,12 @@ async function main() {
logger.setDebugLogging(true); logger.setDebugLogging(true);
logger.info(await getInfoString());
process.on("SIGINT", () => handleTerminate("SIGINT"));
process.on("SIGTERM", () => handleTerminate("SIGTERM"));
if (!params.headless) { if (!params.headless) {
logger.debug("Launching XVFB"); logger.debug("Launching XVFB");
child_process.spawn("Xvfb", [ child_process.spawn("Xvfb", [
@ -164,7 +176,7 @@ async function main() {
await browser.launch({ await browser.launch({
profileUrl: params.profile, profileUrl: params.profile,
headless: params.headless, headless: params.headless,
signals: true, signals: false,
chromeOptions: { chromeOptions: {
proxy: false, proxy: false,
extraArgs: [ extraArgs: [

View file

@ -1,4 +1,5 @@
import fs from "fs"; import fs from "fs";
import fsp from "fs/promises";
import path from "path"; import path from "path";
const MAX_DEPTH = 2; const MAX_DEPTH = 2;
@ -48,3 +49,19 @@ export function collectAllFileSources(
return []; return [];
} }
/**
 * Build the version banner string for Browsertrix-Crawler.
 *
 * Two manifests are read one after the other, each addressed relative to
 * this module's URL: the package.json two directories up, and warcio's
 * package.json under that directory's node_modules. The `version` field of
 * each is interpolated into the returned string.
 *
 * @returns A promise resolving to the banner text.
 */
export async function getInfoString() {
  // Read and parse a JSON file located relative to this module.
  const loadManifest = async (relativePath: string) => {
    const fileUrl = new URL(relativePath, import.meta.url);
    const text = await fsp.readFile(fileUrl, { encoding: "utf-8" });
    return JSON.parse(text);
  };

  const crawlerPkg = await loadManifest("../../package.json");
  const warcioPkg = await loadManifest(
    "../../node_modules/warcio/package.json",
  );

  return `Browsertrix-Crawler ${crawlerPkg.version} (with warcio.js ${warcioPkg.version})`;
}