browsertrix-crawler/util/redis.js

import Redis from "ioredis";
import { logger } from "./logger.js";
const error = console.error;
let lastLogTime = 0;
let exitOnError = false;
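
// Override console.error so that noisy "[ioredis] Unhandled error event"
// messages are rate-limited instead of flooding the log; once
// setExitOnRedisError() has been called, a repeated redis error is treated
// as fatal and the crawl exits.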
// log only once every 10 seconds (note: despite the name, the value is in milliseconds)
const REDIS_ERROR_LOG_INTERVAL_SECS = 10000;
console.error = function (...args) {
  if (
    typeof args[0] === "string" &&
    args[0].indexOf("[ioredis] Unhandled error event") === 0
  ) {
    let now = Date.now();
    if ((now - lastLogTime) > REDIS_ERROR_LOG_INTERVAL_SECS) {
      if (lastLogTime && exitOnError) {
        logger.fatal("Crawl interrupted, redis gone, exiting", {}, "redis");
      }
      logger.warn("ioredis error", {error: args[0]}, "redis");
      lastLogTime = now;
    }
    return;
  }
  error.call(console, ...args);
};
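
// Create an ioredis client for the given Redis URL. lazyConnect defers the
// actual connection so that connect() can be awaited explicitly and any
// connection error surfaces to the caller.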
export async function initRedis(url) {
  const redis = new Redis(url, {lazyConnect: true});
  await redis.connect();
  return redis;
}
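
// After this is called, a further ioredis error (see the console.error
// override above) causes the crawl to exit with a fatal log message.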
export function setExitOnRedisError() {
  exitOnError = true;
}
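
// Hypothetical usage sketch (not part of this module): a caller would
// typically connect once at startup and opt into fatal error handling once
// the crawl begins. The URL and key below are example values only.
//
//   import { initRedis, setExitOnRedisError } from "./redis.js";
//
//   const redis = await initRedis("redis://localhost:6379/0");
//   setExitOnRedisError();
//   await redis.set("example-key", "example-value");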