Support Extra Hops beyond current scope with --extraHops option (#98)

* extra hops depth: add support for the --extraHops option, which expands the inclusion scope to go N 'extra hops' beyond the existing scope. Fixes the most common use case in #83

* update README with info on `extraHops`, add tests for extraHops

* dependency fix: use pywb 2.6.3, warcio 1.5.0

* bump to 0.5.0-beta.2
Ilya Kreymer 2022-01-15 09:03:09 -08:00 committed by GitHub
parent 9f541ab011
commit 201eab4ad1
10 changed files with 102 additions and 31 deletions


@@ -269,6 +269,16 @@ In addition to the inclusion rules, Browsertrix Crawler supports a separate list
The exclusion regexes are often used with a custom scope, but could be used with a predefined scopeType as well.
#### Extra 'Hops' Beyond Current Scope
Occasionally, it may be useful to augment the scope by allowing links N 'hops' beyond the current scope.
For example, this is most useful when crawling with a `host` or `prefix` scope while also including 'one extra hop': any link to external pages beyond the current host, without following those links further. This is now possible with the `extraHops` setting, which defaults to 0 and can be set to a higher value N (usually 1) to go beyond the current scope.
The `--extraHops` setting can be set globally or per seed to expand the current inclusion scope by N 'hops' beyond the configured scope. Note that this mechanism only expands the inclusion scope; any exclusion rules are still applied. If a URL is excluded by the exclusion rules, that exclusion takes precedence over `--extraHops`.
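As a concrete illustration, the integration test added in this commit crawls `https://example.com/` with two extra hops, which pulls in `www.iana.org` pages that are one and two hops outside the seed's scope. The command below is the one the test runs (with the test-fixture volume mount omitted for brevity):

```
docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl \
  --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7
```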
#### Scope Rule Examples
For example, the following seed will start on `https://example.com/startpage.html` and crawl all pages on the `https://example.com/` domain, except pages that match the regexes `example.com/skip.*` or `example.com/search.*`


@@ -401,7 +401,7 @@ class Crawler {
for (let i = 0; i < this.params.scopedSeeds.length; i++) {
const seed = this.params.scopedSeeds[i];
if (!await this.queueUrl(i, seed.url, 0)) {
if (!await this.queueUrl(i, seed.url, 0, 0)) {
if (this.limitHit) {
break;
}
@@ -479,7 +479,7 @@ class Crawler {
}
async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
const {url, seedId, depth} = urlData;
const {url, seedId, depth, extraHops = 0} = urlData;
if (!await this.isHTML(url)) {
try {
@@ -509,7 +509,7 @@ class Crawler {
for (const opts of selectorOptsList) {
const links = await this.extractLinks(page, opts);
await this.queueInScopeUrls(seedId, links, depth);
await this.queueInScopeUrls(seedId, links, depth, extraHops);
}
}
@@ -544,16 +544,25 @@ class Crawler {
return results;
}
async queueInScopeUrls(seedId, urls, depth) {
async queueInScopeUrls(seedId, urls, depth, extraHops = 0) {
try {
depth += 1;
const seed = this.params.scopedSeeds[seedId];
for (const url of urls) {
const captureUrl = seed.isIncluded(url, depth);
// new number of extra hops, set if this hop is out-of-scope (oos)
const newExtraHops = extraHops + 1;
if (captureUrl) {
await this.queueUrl(seedId, captureUrl, depth);
for (const possibleUrl of urls) {
const res = seed.isIncluded(possibleUrl, depth, newExtraHops);
if (!res) {
continue;
}
const {url, isOOS} = res;
if (url) {
await this.queueUrl(seedId, url, depth, isOOS ? newExtraHops : extraHops);
}
}
} catch (e) {
@@ -561,7 +570,7 @@
}
}
async queueUrl(seedId, url, depth) {
async queueUrl(seedId, url, depth, extraHops = 0) {
if (this.limitHit) {
return false;
}
@@ -576,7 +585,11 @@
}
await this.crawlState.add(url);
this.cluster.queue({url, seedId, depth});
const urlData = {url, seedId, depth};
if (extraHops) {
urlData.extraHops = extraHops;
}
this.cluster.queue(urlData);
return true;
}
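To make the new bookkeeping easier to follow, here is a minimal standalone sketch of the hop accounting that `queueInScopeUrls` and `queueUrl` implement above. It is not the actual `Crawler` code: `seed` stands in for a `ScopedSeed`, and `queue` is a plain array standing in for the cluster queue and crawl state.

```js
// Simplified model of the queueing logic above (illustration only).
function queueLinks(seed, links, depth, extraHops, queue) {
  const newDepth = depth + 1;
  // candidate extra-hop count, used only if a link turns out to be out-of-scope (oos)
  const newExtraHops = extraHops + 1;

  for (const possibleUrl of links) {
    const res = seed.isIncluded(possibleUrl, newDepth, newExtraHops);
    if (!res) {
      continue; // too deep, excluded, or no extra hops left
    }
    const { url, isOOS } = res;
    // an out-of-scope link consumes one extra hop; an in-scope link does not
    queue.push({ url, depth: newDepth, extraHops: isOOS ? newExtraHops : extraHops });
  }
}
```

Because `extraHops` travels with each queued URL, a page reached via an extra hop passes the incremented count on to its own outlinks, so the chain stops once the count exceeds `maxExtraHops` inside `ScopedSeed.isIncluded`.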


@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.5.0-beta.1",
"version": "0.5.0-beta.2",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
@@ -27,6 +27,6 @@
"eslint-plugin-react": "^7.22.0",
"jest": "^26.6.3",
"md5": "^2.3.0",
"warcio": "^1.4.3"
"warcio": "^1.5.0"
}
}


@@ -1,4 +1,4 @@
#pywb>=2.6.0
git+https://github.com/webrecorder/pywb@twitter-rw
pywb>=2.6.3
#git+https://github.com/webrecorder/pywb@twitter-rw
uwsgi
wacz>=0.3.2


@@ -0,0 +1,34 @@
const util = require("util");
const exec = util.promisify(require("child_process").exec);
const fs = require("fs");
test("check that URLs are crawled 2 extra hops beyond depth", async () => {
jest.setTimeout(60000);
try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
}
catch (error) {
console.log(error);
}
const crawled_pages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
const expectedPages = [
"https://example.com/",
"https://www.iana.org/domains/example",
"http://www.iana.org/",
"http://www.iana.org/domains",
"http://www.iana.org/protocols",
"http://www.iana.org/numbers",
"http://www.iana.org/about",
];
for (const page of crawled_pages.trim().split("\n")) {
const url = JSON.parse(page).url;
if (!url) {
continue;
}
expect(expectedPages.indexOf(url) >= 0).toBe(true);
}
});


@@ -2,10 +2,10 @@ const util = require("util");
const exec = util.promisify(require("child_process").exec);
const fs = require("fs");
test("check that all urls in a file list are crawled when the filelisturl param is passed", async () => {
test("check that URLs one-depth out from the seed-list are crawled", async () => {
jest.setTimeout(30000);
try{
try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
}


@@ -61,6 +61,12 @@ class ArgParser {
type: "number",
},
"extraHops": {
describe: "Number of extra 'hops' to follow, beyond the current scope",
default: 0,
type: "number"
},
"limit": {
describe: "Limit crawl to this number of pages",
default: 0,
@@ -366,6 +372,7 @@ class ArgParser {
include: argv.include,
exclude: argv.exclude,
depth: argv.depth,
extraHops: argv.extraHops,
};
argv.scopedSeeds = [];


@@ -1,6 +1,6 @@
class ScopedSeed
{
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
const parsedUrl = this.parseUrl(url);
this.url = parsedUrl.href;
this.include = this.parseRx(include);
@@ -17,6 +17,7 @@ class ScopedSeed
this.sitemap = this.resolveSiteMap(sitemap);
this.allowHash = allowHash;
this.maxExtraHops = extraHops;
this.maxDepth = depth < 0 ? 99999 : depth;
}
@@ -93,7 +94,7 @@ class ScopedSeed
return depth >= this.maxDepth;
}
isIncluded(url, depth) {
isIncluded(url, depth, extraHops = 0) {
if (depth > this.maxDepth) {
return false;
}
@@ -125,9 +126,15 @@
}
}
let isOOS = false;
if (!inScope) {
//console.log(`Not in scope ${url} ${this.include}`);
return false;
if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
isOOS = true;
} else {
//console.log(`Not in scope ${url} ${this.include}`);
return false;
}
}
// check exclusions
@@ -138,7 +145,7 @@
}
}
return url;
return {url, isOOS};
}
}
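One detail worth noting in the change above: the extra-hops check only rescues URLs that fail the inclusion rules, and the exclusion regexes are evaluated afterwards, so an excluded URL is dropped even when extra hops remain. A condensed sketch of that decision order (ignoring URL parsing and the regex matching of the real `ScopedSeed`; `inScope` and `excluded` are assumed to be the results of those checks):

```js
// Condensed decision logic of isIncluded() above (illustration only).
function scopeDecision(url, inScope, excluded, extraHops, maxExtraHops) {
  let isOOS = false;
  if (!inScope) {
    if (maxExtraHops && extraHops <= maxExtraHops) {
      isOOS = true; // out of scope, but still within the allowed extra hops
    } else {
      return false; // out of scope and no extra hops available
    }
  }
  if (excluded) {
    return false; // exclusions always take precedence, even for extra-hop URLs
  }
  return { url, isOOS };
}
```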


@@ -27,7 +27,7 @@ class BaseState
recheckScope(data, seeds) {
const seed = seeds[data.seedId];
return seed.isIncluded(data.url, data.depth);
return seed.isIncluded(data.url, data.depth, data.extraHops);
}
}


@@ -1152,9 +1152,9 @@ camelcase@^6.0.0:
integrity sha512-c7wVvbw3f37nuobQNtgsgG9POC9qMbNuMQmTCqZv23b6MIz0fcYpBiOlv9gEN/hdLdnZTDQhg6e9Dq5M1vKvfg==
caniuse-lite@^1.0.30001219:
version "1.0.30001228"
resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001228.tgz#bfdc5942cd3326fa51ee0b42fbef4da9d492a7fa"
integrity sha512-QQmLOGJ3DEgokHbMSA8cj2a+geXqmnpyOFT0lhQV6P3/YOJvGDEwoedcwxEQ30gJIwIIunHIicunJ2rzK5gB2A==
version "1.0.30001299"
resolved "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001299.tgz"
integrity sha512-iujN4+x7QzqA2NCSrS5VUy+4gLmRd4xv6vbBBsmfVqTx8bLAD8097euLqQgKxSVLvxjSDcvF1T/i9ocgnUFexw==
capture-exit@^2.0.0:
version "2.0.0"
@@ -3414,7 +3414,7 @@ nice-try@^1.0.4:
resolved "https://registry.yarnpkg.com/nice-try/-/nice-try-1.0.5.tgz#a3378a7696ce7d223e88fc9b764bd7ef1089e366"
integrity sha512-1nh45deeb5olNY7eX82BkPO7SSxR5SSYJiPTrTdFUVYwAl8CKMA5N9PjTYkHiRjisVcxcQ1HXdLhx2qxxJzLNQ==
node-fetch@^2.6.0, node-fetch@^2.6.1:
node-fetch@^2.6.1:
version "2.6.1"
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
@@ -4811,15 +4811,15 @@ walker@^1.0.7, walker@~1.0.5:
dependencies:
makeerror "1.0.x"
warcio@^1.4.3:
version "1.4.5"
resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.4.5.tgz#24ca61f799185c5d88cdd0a65d279f376b4f9a63"
integrity sha512-VwFBdmEQhWHmxsdyiLM0INHD1KZ2+EGYzslZXFe6JdbuTfSF/dYRQ/wEdvp+m28mydphROF6D32KfkIMRU1NZw==
warcio@^1.5.0:
version "1.5.0"
resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.5.0.tgz#da80805f36b26c68c3b79e9d1d334f8df746df3e"
integrity sha512-80X3IJ0L5OZYRI/5gIjrLzivP/GVWtWrWsNexvSkfeSafoMsXxViywAuotMh4+WzjrcgDA9SGR1Gpg/uXl/9Fw==
dependencies:
"@peculiar/webcrypto" "^1.1.1"
esm "^3.2.25"
hi-base32 "^0.5.0"
node-fetch "^2.6.0"
node-fetch "^2.6.1"
pako "^1.0.11"
uuid-random "^1.3.0"
yargs "^15.3.1"