link extraction optimization: for scopeType page, set depth == extraHops to avoid getting links (#364)

if we know no additional links will be used
2025-10-19 06:23:16 +00:00 · 2023-08-31 13:42:14 -07:00 · 2023-08-31 13:42:14 -07:00 · 3c2f5f8934
commit 3c2f5f8934
parent cf404efa13
3 changed files with 8 additions and 1 deletions
--- a/crawler.js
+++ b/crawler.js
@ -1142,6 +1142,7 @@ self.__bx_behaviors.selectMainBehavior();

    // skip extraction if at max depth
    if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
+      logger.debug("Skipping Link Extraction, At Max Depth");
      return;
    }

--- a/package.json
+++ b/package.json
@ -1,6 +1,6 @@
 {
  "name": "browsertrix-crawler",
-  "version": "0.10.4",
+  "version": "0.10.5",
  "main": "browsertrix-crawler",
  "type": "module",
  "repository": "https://github.com/webrecorder/browsertrix-crawler",
--- a/util/seeds.js
+++ b/util/seeds.js
@ -23,6 +23,12 @@ export class ScopedSeed
      this.include = [...include, ...this.include];
    }

+    // for page scope, the depth is set to extraHops, as no other
+    // crawling is done
+    if (this.scopeType === "page") {
+      depth = extraHops;
+    }
+
    this.sitemap = this.resolveSiteMap(sitemap);
    this.allowHash = allowHash;
    this.maxExtraHops = extraHops;