Exclusion Optimizations: follow-up to #408 (#423)

Follow-up to #408 - optimized exclusion filtering:
- use zscan with the default count instead of an ordered scan when removing excluded URLs
- use glob matching when possible (when the exclusion is a plain string rather than a regex, as determined by a string check) - see the sketch after this list
- move the isInScope() check into the worker to avoid creating and then closing a page for every excluded URL
- tests: update the saved-state test to be more resilient to delays
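
For illustration, a minimal standalone sketch of the zscan-plus-glob approach described above (assuming ioredis; the queue key name "q" and the JSON member format are stand-ins mirroring the diff below, not the crawler's exact code):

import Redis from "ioredis";

const redis = new Redis();

// Treat the exclusion as a plain string if escaping regex metacharacters changes nothing.
function isPlainString(s) {
  return s.replace(/[\\^$*+?.()|[\]{}]/g, "\\$&") === s;
}

async function filterQueue(regexStr, qkey = "q") {
  const regex = new RegExp(regexStr);

  // Plain strings can be pre-filtered server-side with a ZSCAN MATCH glob;
  // regexes fall back to scanning every member and testing client-side.
  const opts = isPlainString(regexStr) ? { match: `*${regexStr}*` } : undefined;
  const stream = redis.zscanStream(qkey, opts);

  for await (const chunk of stream) {
    // each ZSCAN chunk alternates member, score, member, score, ...
    for (let i = 0; i < chunk.length; i += 2) {
      const member = chunk[i];
      const { url } = JSON.parse(member);
      if (regex.test(url)) {
        await redis.zrem(qkey, member);
      }
    }
  }
}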

args: also support '--text false' for backwards compatibility, fixes
webrecorder/browsertrix-cloud#1334
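
For reference, a hedged sketch of the resulting behavior (a hypothetical standalone helper, not the actual ArgParser code; see the coerce diff below):

// --text               -> ["to-pages"]  (default text extraction)
// --text true          -> ["to-pages"]
// --text false         -> []            (text extraction disabled, matching the older boolean-style flag)
// --text <mode list>   -> parsed as a list of extraction modes
function coerceTextArg(values) {
  if (!values.length || (values.length === 1 && values[0] === "true")) {
    return ["to-pages"];
  }
  if (values.length === 1 && values[0] === "false") {
    return [];
  }
  return values;
}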

bump to 0.12.1
Ilya Kreymer 2023-11-03 15:15:09 -07:00 committed by GitHub
parent 15661eb9c8
commit dd7b926d87
6 changed files with 61 additions and 20 deletions

crawler.js

@@ -481,11 +481,6 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;
 
-    if (!this.isInScope(data, logDetails)) {
-      logger.info("Page no longer in scope", data);
-      return true;
-    }
-
     // run custom driver here
     await this.driver({page, data, crawler: this});

package.json

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.12.0",
+  "version": "0.12.1",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",

tests/saved-state.test.js

@@ -93,8 +93,7 @@ test("check crawl restarted with saved state", async () => {
   try {
     proc = exec(`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`, {shell: "/bin/bash"}, wait.callback);
-  }
-  catch (error) {
+  } catch (error) {
     console.log(error);
   }
@@ -102,11 +101,22 @@ test("check crawl restarted with saved state", async () => {
   redis = new Redis("redis://127.0.0.1:36379/0", {lazyConnect: true});
-  await redis.connect({maxRetriesPerRequest: 50});
+  try {
+    await redis.connect({
+      maxRetriesPerRequest: 100,
+      retryStrategy(times) {
+        return times < 100 ? 1000 : null;
+      }
+    });
 
-  expect(await redis.get("test:d")).toBe(numDone + "");
+    await new Promise((resolve) => setTimeout(resolve, 2000));
 
-  proc.kill("SIGINT");
+    expect(await redis.get("test:d")).toBe(numDone + "");
+  } catch (e) {
+    console.log(e);
+  } finally {
+    proc.kill("SIGINT");
+  }
 
   finishProcess = wait.p;
 });

util/argParser.js

@@ -207,6 +207,9 @@ class ArgParser {
         if (!array.length || (array.length === 1 && array[0] === "true")) {
           return ["to-pages"];
         }
+        if (array.length === 1 && array[0] === "false") {
+          return [];
+        }
         return coerce(array);
       }
     },

util/state.js

@@ -190,7 +190,7 @@ return 0;
   }
 
   async markFinished(url) {
-    await this.redis.call("hdel", this.pkey, url);
+    await this.redis.hdel(this.pkey, url);
 
     return await this.redis.incr(this.dkey);
   }
@@ -201,6 +201,12 @@ return 0;
     return await this.redis.incr(this.dkey);
   }
 
+  async markExcluded(url) {
+    await this.redis.hdel(this.pkey, url);
+
+    await this.redis.srem(this.skey, url);
+  }
+
   recheckScope(data, seeds) {
     const seed = seeds[data.seedId];
@@ -262,7 +268,7 @@ return 0;
     for (const seed of seeds) {
      seed.addExclusion(regex);
     }
-    // can happen async w/o slowing down scrolling
+    // can happen async w/o slowing down crawling
     // each page is still checked if in scope before crawling, even while
     // queue is being filtered
     this.filterQueue(regex);
@@ -284,25 +290,45 @@ return 0;
     }
   }
 
-  async filterQueue(regexStr) {
+  isStrMatch(s) {
+    // if matches original string, then consider not a regex
+    return s.replace(/\\/g, "").replace(/[\\^$*+?.()|[\]{}]/g, "\\$&") === s;
+  }
+
+  filterQueue(regexStr) {
     const regex = new RegExp(regexStr);
-    const qsize = await this.redis.zcard(this.qkey);
-    const count = 50;
+    let matcher = undefined;
 
-    for (let i = 0; i < qsize; i += count) {
-      const results = await this.redis.zrangebyscore(this.qkey, 0, "inf", "limit", i, count);
+    // regexStr just a string, optimize by using glob matching
+    if (this.isStrMatch(regexStr)) {
+      matcher = {"match": `*${regexStr}*`};
+    }
+
+    const stream = this.redis.zscanStream(this.qkey, matcher);
+
+    stream.on("data", async (results) => {
+      stream.pause();
       for (const result of results) {
        const { url } = JSON.parse(result);
        if (regex.test(url)) {
          const removed = await this.redis.zrem(this.qkey, result);
-          await this.redis.srem(this.skey, url);
+          //if (removed) {
+          await this.markExcluded(url);
+          //}
          logger.debug("Removing excluded URL", {url, regex, removed}, "exclusion");
        }
       }
-    }
+
+      stream.resume();
+    });
+
+    return new Promise(resolve => {
+      stream.on("end", () => {
+        resolve();
+      });
+    });
   }
 
   async incFailCount() {

util/worker.js

@@ -243,6 +243,13 @@ export class PageWorker
       // see if any work data in the queue
       if (data) {
+        // filter out any out-of-scope pages right away
+        if (!this.crawler.isInScope(data, this.logDetails)) {
+          logger.info("Page no longer in scope", data);
+          await crawlState.markExcluded(data.url);
+          continue;
+        }
+
         // init page (new or reuse)
         const opts = await this.initPage(data.url);