Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Add option for sleep interval after behaviors run + timing cleanup (#257)
* Add --pageExtraDelay option to add extra delay/wait time after every page (fixes #131)
* Store total page time in 'maxPageTime', include pageExtraDelay
* Rename timeout -> pageLoadTimeout
* Cleanup:
  - store seconds for most interval checks, convert to ms only for API calls, remove most sec <-> ms conversions
  - add secondsElapsed() utility function to help with checking elapsed time
  - clean up comments

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
commit b0e93cb06e (parent 02fb137b2c)
6 changed files with 72 additions and 48 deletions
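The core of the timing cleanup is that every per-page budget is now kept in seconds and summed once into `maxPageTime`. A minimal sketch of that budget, assuming illustrative values for the `FETCH_TIMEOUT_SECS` and `PAGE_OP_TIMEOUT_SECS` constants (they are referenced but not defined in the diff below, so these numbers are assumptions for illustration only):

```js
// Sketch only: the constant values below are assumed, not taken from the crawler source.
const FETCH_TIMEOUT_SECS = 30;   // assumed value
const PAGE_OP_TIMEOUT_SECS = 5;  // assumed value

const params = { pageLoadTimeout: 90, behaviorTimeout: 90, pageExtraDelay: 0 };

// total per-page budget, in seconds; if exceeded, the worker interrupts and moves on
const maxPageTime = params.pageLoadTimeout + params.behaviorTimeout +
  FETCH_TIMEOUT_SECS * 2 + PAGE_OP_TIMEOUT_SECS * 2 + params.pageExtraDelay;

console.log(maxPageTime); // 90 + 90 + 60 + 10 + 0 = 250
```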
README.md (44 changes)

@@ -68,13 +68,12 @@ Browsertrix Crawler includes a number of additional command-line options, explai…
   --crawlId, --id               A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var)
-                                [string] [default: "06bf9a4df9f7"]
+                                [string] [default: "ce75810e6874"]
   --newContext                  Deprecated as of 0.8.0, any values passed will be ignored [string] [default: null]
-  --waitUntil                   Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','
+  --waitUntil                   Playwright page.goto() condition to wait for before continuing
                                 [default: "load"]
   --depth                       The depth of the crawl for all seeds [number] [default: -1]

@@ -83,11 +82,11 @@
                                 [number] [default: 0]
   --limit                       Limit crawl to this number of pages [number] [default: 0]
-  --timeout                     Timeout for each page to load (in seconds) [number] [default: 90]
+  --pageLoadTimeout, --timeout  Timeout for each page to load (in seconds) [number] [default: 90]
   --scopeType                   A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes
                                 [string] [choices: "page", "page-spa", "prefix", "host", "domain", "any", "custom"]
   --scopeIncludeRx, --include   Regex of page URLs that should be in…

@@ -131,19 +130,20 @@
   --generateWACZ, --generatewacz, --generateWacz
                                 If set, generate wacz [boolean] [default: false]
-  --logging                     Logging options for crawler, can include: stats, pywb, behaviors, behaviors-debug, jserrors
+  --logging                     Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug
                                 [string] [default: "stats"]
   --text                        If set, extract text to the pages.jsonl file [boolean] [default: false]
   --cwd                         Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()
                                 [string] [default: "/crawls"]
-  --mobileDevice                Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts [string]
+  --mobileDevice                Emulate mobile device by name from: https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/deviceDescriptorsSource.json [string]
   --userAgent                   Override user-agent with specified string [string]
   --userAgentSuffix             Append suffix to existing browser us…

@@ -162,12 +162,16 @@
                                 …age behavior will run on each page. If 0, a behavior can run until finish. [number] [default: 90]
+  --pageExtraDelay, --delay     If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page
+                                [number] [default: 0]
   --profile                     Path to tar.gz file which will be extracted and used as the browser profile [string]
   --screenshot                  Screenshot options for crawler, can include: view, thumbnail, fullPage (comma-separated list)
                                 [string] [default: ""]
   --screencastPort              If set to a non-zero value, starts an HTTP server with screencast access…

@@ -181,9 +185,10 @@
                                 …o record in combined WARCs
-  --redisStoreUrl               If set, url for remote redis server to store state. Otherwise, using in-memory store [string]
+  --redisStoreUrl               If set, url for remote redis server to store state. Otherwise, using in-memory store
+                                [string] [default: "redis://localhost:6379/0"]
   --saveState                   If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted
                                 [string] [choices: "never", "partial", "always"] [default: "partial"]

@@ -212,8 +217,11 @@
   --netIdleWait                 if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope
                                 [number] [default: -1]
   --lang                        if set, sets the language used by the browser, should be ISO 639 language[-country] code [string]
   --config                      Path to YAML config file
 ```
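As a usage note (hypothetical values, not taken from the README diff above): a crawl started with `--pageLoadTimeout 60 --pageExtraDelay 5` would allow each page up to 60 seconds to load and then, after behaviors finish, sleep an extra 5 seconds before moving on to the next page; the old flag names remain available as the aliases `--timeout` and `--delay`.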
crawler.js (36 changes)

@@ -20,7 +20,7 @@ import { parseArgs } from "./util/argParser.js";
 import { initRedis } from "./util/redis.js";
 import { logger, errJSON } from "./util/logger.js";
 import { runWorkers } from "./util/worker.js";
-import { sleep, timedRun } from "./util/timing.js";
+import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
 
 import { Browser } from "./util/browser.js";

@@ -76,7 +76,11 @@ export class Crawler {
 
     this.saveStateFiles = [];
     this.lastSaveTime = 0;
-    this.saveStateInterval = this.params.saveStateInterval * 1000;
+
+    // sum of page load + behavior timeouts + 2 x fetch + cloudflare + link extraction timeouts + extra page delay
+    // if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
+    this.maxPageTime = this.params.pageLoadTimeout + this.params.behaviorTimeout +
+      FETCH_TIMEOUT_SECS*2 + PAGE_OP_TIMEOUT_SECS*2 + this.params.pageExtraDelay;
 
     this.emulateDevice = this.params.emulateDevice || {};

@@ -85,7 +89,7 @@ export class Crawler {
 
     this.gotoOpts = {
       waitUntil: this.params.waitUntil,
-      timeout: this.params.timeout
+      timeout: this.params.pageLoadTimeout * 1000
     };
 
     // pages directory

@@ -152,7 +156,9 @@ export class Crawler {
 
     logger.debug(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");
 
-    this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname());
+    logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");
+
+    this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.maxPageTime, os.hostname());
 
     if (this.params.saveState === "always" && this.params.saveStateInterval) {
       logger.debug(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");

@@ -406,11 +412,9 @@ export class Crawler {
     } else if (data.skipBehaviors) {
       logger.info("Skipping behaviors for slow page", logDetails, "behavior");
     } else {
-      const behaviorTimeout = this.params.behaviorTimeout / 1000;
-
       const res = await timedRun(
         this.runBehaviors(page, data.filteredFrames, logDetails),
-        behaviorTimeout,
+        this.params.behaviorTimeout,
         "Behaviors timed out",
         logDetails,
         "behavior"

@@ -423,6 +427,11 @@ export class Crawler {
       }
     }
 
+    if (this.params.pageExtraDelay) {
+      logger.info(`Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`, logDetails);
+      await sleep(this.params.pageExtraDelay);
+    }
+
     return true;
   }

@@ -557,8 +566,8 @@ export class Crawler {
     }
 
     if (this.params.timeLimit) {
-      const elapsed = (Date.now() - this.startTime) / 1000;
-      if (elapsed > this.params.timeLimit) {
+      const elapsed = secondsElapsed(this.startTime);
+      if (elapsed >= this.params.timeLimit) {
        logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
        interrupt = true;
      }

@@ -683,9 +692,10 @@ export class Crawler {
       }
     });
 
-    const totalPageTimeout = (this.params.behaviorTimeout + this.params.timeout) / 1000 + 60;
-
-    await runWorkers(this, this.params.workers, totalPageTimeout);
+    // --------------
+    // Run Crawl Here!
+    await runWorkers(this, this.params.workers, this.maxPageTime);
+    // --------------
 
     await this.serializeConfig(true);

@@ -1359,7 +1369,7 @@ export class Crawler {
 
     if (!done) {
       // if not done, save state only after specified interval has elapsed
-      if ((now.getTime() - this.lastSaveTime) < this.saveStateInterval) {
+      if (secondsElapsed(this.lastSaveTime, now) < this.params.saveStateInterval) {
         return;
       }
     }
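Note the one conversion that remains: `gotoOpts.timeout` is multiplied by 1000 because Playwright's `page.goto()` expects milliseconds, while the rest of the code now stays in seconds (the commit message's "convert to ms only for api calls"). crawler.js also passes seconds directly to `sleep()` (e.g. `await sleep(this.params.pageExtraDelay)`); the implementation of `sleep` in util/timing.js is not shown in this diff, but a minimal seconds-based helper consistent with that call site might look like:

```js
// Assumed shape of util/timing.js's sleep(); the real definition is not part of this diff.
export function sleep(seconds) {
  return new Promise((resolve) => setTimeout(resolve, seconds * 1000));
}
```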
util/argParser.js

@@ -74,7 +74,8 @@ class ArgParser
         type: "number",
       },
 
-      "timeout": {
+      "pageLoadTimeout": {
+        alias: "timeout",
         describe: "Timeout for each page to load (in seconds)",
         default: 90,
         type: "number",

@@ -223,6 +224,13 @@ class ArgParser
         type: "number",
       },
 
+      "pageExtraDelay": {
+        alias: "delay",
+        describe: "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
+        default: 0,
+        type: "number",
+      },
+
       "profile": {
         describe: "Path to tar.gz file which will be extracted and used as the browser profile",
         type: "string",

@@ -354,10 +362,7 @@ class ArgParser
       logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
     }
 
-    argv.timeout *= 1000;
-
-    // waitUntil condition must be: load, domcontentloaded, networkidle
-    // TODO: Playwright migration - for now, can only support one
+    // waitUntil condition must be one of WAIT_UNTIL_OPTS: load, domcontentloaded, networkidle
+    // (see: https://playwright.dev/docs/api/class-page#page-goto-option-wait-until)
     if (!WAIT_UNTIL_OPTS.includes(argv.waitUntil)) {
       logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));

@@ -385,9 +390,6 @@ class ArgParser
       argv.behaviors = argv.behaviors.split(",");
     }
     argv.behaviors.forEach((x) => behaviorOpts[x] = true);
-    if (argv.behaviorTimeout) {
-      behaviorOpts.timeout = argv.behaviorTimeout *= 1000;
-    }
     behaviorOpts.log = BEHAVIOR_LOG_FUNC;
     argv.behaviorOpts = JSON.stringify(behaviorOpts);
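The rename keeps the old flag working through a yargs alias. A small standalone sketch of how such an alias behaves (hypothetical script, not part of the crawler; the crawler's own parser wraps this in its ArgParser class):

```js
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

const argv = yargs(hideBin(process.argv))
  .option("pageLoadTimeout", {
    alias: "timeout",
    describe: "Timeout for each page to load (in seconds)",
    default: 90,
    type: "number",
  })
  .parse();

// Both `--timeout 120` and `--pageLoadTimeout 120` populate argv.pageLoadTimeout
// (and argv.timeout), so existing configs that pass --timeout keep working.
console.log(argv.pageLoadTimeout);
```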
util/state.js

@@ -39,7 +39,7 @@ export class PageState
 // ============================================================================
 export class RedisCrawlState
 {
-  constructor(redis, key, pageTimeout, uid) {
+  constructor(redis, key, maxPageTime, uid) {
    this.redis = redis;
 
    this.maxRetryPending = 1;

@@ -48,7 +48,7 @@ export class RedisCrawlState
 
    this.uid = uid;
    this.key = key;
-    this.pageTimeout = pageTimeout / 1000;
+    this.maxPageTime = maxPageTime;
 
    this.qkey = this.key + ":q";
    this.pkey = this.key + ":p";

@@ -152,7 +152,7 @@ return 0;
  async markStarted(url) {
    const started = this._timestamp();
 
-    return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.pageTimeout);
+    return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.maxPageTime);
  }
 
  async markFinished(url) {
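The constructor now stores `maxPageTime` without the former `/ 1000` conversion because the value already arrives in seconds, and Redis expirations (`EX`) are themselves specified in seconds, so the value can be handed to the `markstarted` script as-is. A small standalone ioredis sketch of that seconds convention (hypothetical key and values; the crawler's actual markstarted Lua script is not shown in this diff):

```js
import Redis from "ioredis";

// Illustration of a seconds-based pending-page entry; an assumption about
// how maxPageTime is used, not the crawler's real markstarted script.
async function markStartedSketch(redis, pkey, url, maxPageTime) {
  const started = new Date().toISOString();
  // expire the pending entry after maxPageTime seconds if the page never finishes
  await redis.set(`${pkey}:${url}`, started, "EX", maxPageTime);
}

const redis = new Redis("redis://localhost:6379/0");
await markStartedSketch(redis, "crawl:p", "https://example.com/", 250);
```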
util/timing.js

@@ -24,4 +24,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai…
   });
 }
 
+export function secondsElapsed(startTime, nowDate = null) {
+  nowDate = nowDate || new Date();
+
+  return (nowDate.getTime() - startTime) / 1000;
+}
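`secondsElapsed()` compares a millisecond timestamp (e.g. from `Date.now()`) against an optional `Date`, defaulting to now, and returns seconds. A small usage sketch mirroring the time-limit check in crawler.js (hypothetical values; adjust the import path to where timing.js lives relative to the caller):

```js
import { secondsElapsed } from "./util/timing.js";

const startTime = Date.now();
const timeLimit = 3600; // seconds, hypothetical crawl time limit

// somewhere in a periodic check:
if (secondsElapsed(startTime) >= timeLimit) {
  console.log(`Time threshold reached after ${secondsElapsed(startTime)} seconds, stopping`);
}
```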
util/worker.js

@@ -6,14 +6,14 @@ const MAX_REUSE = 5;
 const NEW_WINDOW_TIMEOUT = 10;
 
 // ===========================================================================
-export function runWorkers(crawler, numWorkers, timeout) {
+export function runWorkers(crawler, numWorkers, maxPageTime) {
   logger.info(`Creating ${numWorkers} workers`, {}, "worker");
 
   const workers = [];
 
   for (let i = 0; i < numWorkers; i++) {
-    //workers.push(new PageWorker(`worker-${i+1}`, crawler, timeout));
-    workers.push(new PageWorker(i, crawler, timeout));
+    //workers.push(new PageWorker(`worker-${i+1}`, crawler, maxPageTime));
+    workers.push(new PageWorker(i, crawler, maxPageTime));
   }
 
   return Promise.allSettled(workers.map((worker) => worker.run()));

@@ -23,10 +23,10 @@ export function runWorkers(crawler, numWorkers, timeout) {
 // ===========================================================================
 export class PageWorker
 {
-  constructor(id, crawler, timeout) {
+  constructor(id, crawler, maxPageTime) {
     this.id = id;
     this.crawler = crawler;
-    this.timeout = timeout;
+    this.maxPageTime = maxPageTime;
 
     this.reuseCount = 0;
     this.page = null;

@@ -134,7 +134,7 @@ export class PageWorker
       await Promise.race([
         timedRun(
           this.crawler.crawlPage(opts),
-          this.timeout,
+          this.maxPageTime,
           "Page Worker Timeout",
           {workerid},
           "worker"
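With this change the worker's overall per-page timeout is the same `maxPageTime` budget computed once in the Crawler constructor, replacing the previous ad-hoc `(behaviorTimeout + timeout) / 1000 + 60` calculation; if a page exceeds that budget, `timedRun` logs "Page Worker Timeout" and the worker interrupts the page and moves on to the next one, as described in the constructor comment.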