Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 06:23:16 +00:00)
profiles:
- add our own signal handling to create-login-profile to ensure fast exit in k8s
- print crawler version info string on startup
parent f96c6a13dc
commit b57dea50b5

3 changed files with 32 additions and 19 deletions
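For context on the k8s motivation: deleting a pod sends SIGTERM to the container's main process and only falls back to SIGKILL after the termination grace period, so catching the signal and exiting immediately keeps profile-creation pods from lingering. Below is a standalone sketch of the terminate-on-signal pattern this commit adds to create-login-profile; console.error stands in for the project's logger, which is not imported here.

// Minimal sketch of the pattern added in the diff below: log the signal
// and exit right away so Kubernetes does not have to wait out the
// termination grace period before killing the container.
function handleTerminate(signame: string) {
  console.error(`Got signal ${signame}, exiting`);
  process.exit(1);
}

process.on("SIGINT", () => handleTerminate("SIGINT"));
process.on("SIGTERM", () => handleTerminate("SIGTERM"));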
@@ -35,7 +35,7 @@ import { initRedis } from "./util/redis.js";
 import { logger, formatErr } from "./util/logger.js";
 import { WorkerOpts, WorkerState, runWorkers } from "./util/worker.js";
 import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
-import { collectAllFileSources } from "./util/file_reader.js";
+import { collectAllFileSources, getInfoString } from "./util/file_reader.js";
 
 import { Browser } from "./util/browser.js";
 
@@ -428,7 +428,7 @@ export class Crawler {
     this.logFH = fs.createWriteStream(this.logFilename);
     logger.setExternalLogStream(this.logFH);
 
-    this.infoString = await this.getInfoString();
+    this.infoString = await getInfoString();
     logger.info(this.infoString);
 
     logger.info("Seeds", this.params.scopedSeeds);
@@ -1008,22 +1008,6 @@ self.__bx_behaviors.selectMainBehavior();
     return res ? frame : null;
   }
 
-  async getInfoString() {
-    const packageFileJSON = JSON.parse(
-      await fsp.readFile(new URL("../package.json", import.meta.url), {
-        encoding: "utf-8",
-      }),
-    );
-    const warcioPackageJSON = JSON.parse(
-      await fsp.readFile(
-        new URL("../node_modules/warcio/package.json", import.meta.url),
-        { encoding: "utf-8" },
-      ),
-    );
-
-    return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`;
-  }
-
   async createWARCInfo(filename: string) {
     const warcVersion = "WARC/1.0";
     const type = "warcinfo";
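The getInfoString() method removed above is not dropped; it reappears below as an exported helper in the file_reader util, which is why the package.json paths gain an extra "../" (the helper now lives one directory deeper). A minimal sketch of the underlying technique of resolving a file relative to the current module with new URL and import.meta.url; the relative path is illustrative and depends on where the calling module sits:

import fsp from "fs/promises";

// Read this module's nearest package.json and return its version field.
// "../package.json" is an example path, not a fixed project location.
async function readOwnVersion(): Promise<string> {
  const pkg = JSON.parse(
    await fsp.readFile(new URL("../package.json", import.meta.url), {
      encoding: "utf-8",
    }),
  );
  return pkg.version;
}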
@@ -14,6 +14,7 @@ import { logger } from "./util/logger.js";
 import { Browser } from "./util/browser.js";
 import { initStorage } from "./util/storage.js";
 import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
+import { getInfoString } from "./util/file_reader.js";
 
 const profileHTML = fs.readFileSync(
   new URL("../html/createProfile.html", import.meta.url),
@@ -118,6 +119,11 @@ function getDefaultWindowSize() {
   return `${x},${y}`;
 }
 
+function handleTerminate(signame: string) {
+  logger.info(`Got signal ${signame}, exiting`);
+  process.exit(1);
+}
+
 async function main() {
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   const params: any = yargs(process.argv)
@@ -126,6 +132,12 @@ async function main() {
 
   logger.setDebugLogging(true);
 
+  logger.info(await getInfoString());
+
+  process.on("SIGINT", () => handleTerminate("SIGINT"));
+
+  process.on("SIGTERM", () => handleTerminate("SIGTERM"));
+
   if (!params.headless) {
     logger.debug("Launching XVFB");
     child_process.spawn("Xvfb", [
@@ -164,7 +176,7 @@ async function main() {
   await browser.launch({
     profileUrl: params.profile,
     headless: params.headless,
-    signals: true,
+    signals: false,
     chromeOptions: {
       proxy: false,
       extraArgs: [
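Flipping signals to false presumably stops the Browser helper from installing its own signal handling now that the entrypoint registers handleTerminate itself; the Browser implementation is not part of this diff, so that reading, and the sketch below, are assumptions with hypothetical names.

// Hypothetical illustration of an option-gated handler, not the project's
// actual Browser class: only install a default SIGTERM handler when the
// caller has not taken over signal handling itself.
interface LaunchOpts {
  signals: boolean;
}

function launchWithSignals(opts: LaunchOpts) {
  if (opts.signals) {
    process.on("SIGTERM", () => process.exit(0));
  }
  // ... browser launch would happen here ...
}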
@@ -1,4 +1,5 @@
 import fs from "fs";
+import fsp from "fs/promises";
 import path from "path";
 
 const MAX_DEPTH = 2;
@@ -48,3 +49,19 @@ export function collectAllFileSources(
 
   return [];
 }
+
+export async function getInfoString() {
+  const packageFileJSON = JSON.parse(
+    await fsp.readFile(new URL("../../package.json", import.meta.url), {
+      encoding: "utf-8",
+    }),
+  );
+  const warcioPackageJSON = JSON.parse(
+    await fsp.readFile(
+      new URL("../../node_modules/warcio/package.json", import.meta.url),
+      { encoding: "utf-8" },
+    ),
+  );
+
+  return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`;
+}
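With the helper exported from the file_reader util, both entrypoints log the same banner at startup, as the hunks above show. A minimal usage sketch, assuming the caller sits next to the util/ directory so the import path matches the diff:

import { getInfoString } from "./util/file_reader.js";

// Prints a line of the form
// "Browsertrix-Crawler <version> (with warcio.js <version>)".
console.log(await getInfoString());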