From 201eab4ad1d98e902b21c3b3775c9bd5d88d7d61 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 15 Jan 2022 09:03:09 -0800
Subject: [PATCH] Support Extra Hops beyond current scope with --extraHops
 option (#98)

* extra hops depth: add support for --extraHops option, which expands the inclusion scope to go N 'extra hops' beyond the existing scope. fixes most common use case in #83

* update README with info on `extraHops`, add tests for extraHops

* dependency fix: use pywb 2.6.3, warcio 1.5.0

* bump to 0.5.0-beta.2
---
 README.md                      | 10 ++++++++++
 crawler.js                     | 33 +++++++++++++++++++++++----------
 package.json                   |  4 ++--
 requirements.txt               |  4 ++--
 tests/extra_hops_depth.test.js | 34 ++++++++++++++++++++++++++++++++++
 tests/url_file_list.test.js    |  4 ++--
 util/argParser.js              |  7 +++++++
 util/seeds.js                  | 17 ++++++++++++-----
 util/state.js                  |  2 +-
 yarn.lock                      | 18 +++++++++---------
 10 files changed, 102 insertions(+), 31 deletions(-)
 create mode 100644 tests/extra_hops_depth.test.js

diff --git a/README.md b/README.md
index b594620e..dcaba9cb 100644
--- a/README.md
+++ b/README.md
@@ -269,6 +269,16 @@ In addition to the inclusion rules, Browsertrix Crawler supports a separate list
 
 The exclusion regexes are often used with a custom scope, but could be used with a predefined scopeType as well.
 
+#### Extra 'Hops' Beyond Current Scope
+
+Occasionally, it may be useful to augment the scope by allowing extra links N 'hops' beyond the current scope.
+
+For example, this is most useful when crawling with a `host` or `prefix` scope, but also wanting to include 'one extra hop' - any link to external pages beyond the current host, but not following those links. This is now possible with the `extraHops` setting, which defaults to 0, but can be set to a higher value N (usually 1) to go beyond the current scope.
+
+The `--extraHops` setting can be set globally or per seed to allow expanding the current inclusion scope N 'hops' beyond the configured scope. Note that this mechanism only expands the inclusion scope, and any exclusion rules are still applied. If a URL is to be excluded via the exclusion rules,
+that will take precedence over the `--extraHops`.
+
+
 #### Scope Rule Examples
 
 For example, the following seed will start on `https://example.com/startpage.html` and crawl all pages on the `https://example.com/` domain, except pages that match the regexes `example.com/skip.*` or `example.com/search.*`
diff --git a/crawler.js b/crawler.js
index 81366865..c5593e3e 100644
--- a/crawler.js
+++ b/crawler.js
@@ -401,7 +401,7 @@ class Crawler {
 
     for (let i = 0; i < this.params.scopedSeeds.length; i++) {
       const seed = this.params.scopedSeeds[i];
-      if (!await this.queueUrl(i, seed.url, 0)) {
+      if (!await this.queueUrl(i, seed.url, 0, 0)) {
         if (this.limitHit) {
           break;
         }
@@ -479,7 +479,7 @@ class Crawler {
   }
 
   async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
-    const {url, seedId, depth} = urlData;
+    const {url, seedId, depth, extraHops = 0} = urlData;
 
     if (!await this.isHTML(url)) {
       try {
@@ -509,7 +509,7 @@ class Crawler {
 
     for (const opts of selectorOptsList) {
       const links = await this.extractLinks(page, opts);
-      await this.queueInScopeUrls(seedId, links, depth);
+      await this.queueInScopeUrls(seedId, links, depth, extraHops);
     }
   }
 
@@ -544,16 +544,25 @@ class Crawler {
     return results;
   }
 
-  async queueInScopeUrls(seedId, urls, depth) {
+  async queueInScopeUrls(seedId, urls, depth, extraHops = 0) {
     try {
       depth += 1;
       const seed = this.params.scopedSeeds[seedId];
 
-      for (const url of urls) {
-        const captureUrl = seed.isIncluded(url, depth);
+      // new number of extra hops, set if this hop is out-of-scope (oos)
+      const newExtraHops = extraHops + 1;
 
-        if (captureUrl) {
-          await this.queueUrl(seedId, captureUrl, depth);
+      for (const possibleUrl of urls) {
+        const res = seed.isIncluded(possibleUrl, depth, newExtraHops);
+
+        if (!res) {
+          continue;
+        }
+
+        const {url, isOOS} = res;
+
+        if (url) {
+          await this.queueUrl(seedId, url, depth, isOOS ? newExtraHops : extraHops);
         }
       }
     } catch (e) {
@@ -561,7 +570,7 @@ class Crawler {
     }
   }
 
-  async queueUrl(seedId, url, depth) {
+  async queueUrl(seedId, url, depth, extraHops = 0) {
     if (this.limitHit) {
       return false;
     }
@@ -576,7 +585,11 @@ class Crawler {
     }
 
     await this.crawlState.add(url);
-    this.cluster.queue({url, seedId, depth});
+    const urlData = {url, seedId, depth};
+    if (extraHops) {
+      urlData.extraHops = extraHops;
+    }
+    this.cluster.queue(urlData);
     return true;
   }
 
diff --git a/package.json b/package.json
index 5ebd2c8e..de5bf534 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.5.0-beta.1",
+  "version": "0.5.0-beta.2",
   "main": "browsertrix-crawler",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer , Webrecorder Software",
@@ -27,6 +27,6 @@
     "eslint-plugin-react": "^7.22.0",
     "jest": "^26.6.3",
     "md5": "^2.3.0",
-    "warcio": "^1.4.3"
+    "warcio": "^1.5.0"
   }
 }
diff --git a/requirements.txt b/requirements.txt
index 5efa8dfe..fed48c7e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-#pywb>=2.6.0
-git+https://github.com/webrecorder/pywb@twitter-rw
+pywb>=2.6.3
+#git+https://github.com/webrecorder/pywb@twitter-rw
 uwsgi
 wacz>=0.3.2
diff --git a/tests/extra_hops_depth.test.js b/tests/extra_hops_depth.test.js
new file mode 100644
index 00000000..e5d395fd
--- /dev/null
+++ b/tests/extra_hops_depth.test.js
@@ -0,0 +1,34 @@
+const util = require("util");
+const exec = util.promisify(require("child_process").exec);
+const fs = require("fs");
+
+test("check that URLs are crawled 2 extra hops beyond depth", async () => {
+  jest.setTimeout(60000);
+
+  try {
+    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
+  }
+  catch (error) {
+    console.log(error);
+  }
+
+  const crawled_pages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
+
+  const expectedPages = [
+    "https://example.com/",
+    "https://www.iana.org/domains/example",
+    "http://www.iana.org/",
+    "http://www.iana.org/domains",
+    "http://www.iana.org/protocols",
+    "http://www.iana.org/numbers",
+    "http://www.iana.org/about",
+  ];
+
+  for (const page of crawled_pages.trim().split("\n")) {
+    const url = JSON.parse(page).url;
+    if (!url) {
+      continue;
+    }
+    expect(expectedPages.indexOf(url) >= 0).toBe(true);
+  }
+});
diff --git a/tests/url_file_list.test.js b/tests/url_file_list.test.js
index d9c6a6ce..29e81af6 100644
--- a/tests/url_file_list.test.js
+++ b/tests/url_file_list.test.js
@@ -2,10 +2,10 @@ const util = require("util");
 const exec = util.promisify(require("child_process").exec);
 const fs = require("fs");
 
-test("check that all urls in a file list are crawled when the filelisturl param is passed", async () => {
+test("check that URLs one-depth out from the seed-list are crawled", async () => {
   jest.setTimeout(30000);
 
-  try{
+  try {
     await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
   }
 
diff --git a/util/argParser.js b/util/argParser.js
index d48c2c19..7364d720 100644
--- a/util/argParser.js
+++ b/util/argParser.js
@@ -61,6 +61,12 @@ class ArgParser {
         type: "number",
       },
 
+      "extraHops": {
+        describe: "Number of extra 'hops' to follow, beyond the current scope",
+        default: 0,
+        type: "number"
+      },
+
       "limit": {
         describe: "Limit crawl to this number of pages",
         default: 0,
@@ -366,6 +372,7 @@ class ArgParser {
       include: argv.include,
      exclude: argv.exclude,
       depth: argv.depth,
+      extraHops: argv.extraHops,
     };
 
     argv.scopedSeeds = [];
diff --git a/util/seeds.js b/util/seeds.js
index 2994ffd3..f6446c6f 100644
--- a/util/seeds.js
+++ b/util/seeds.js
@@ -1,6 +1,6 @@
 class ScopedSeed
 {
-  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
+  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
     const parsedUrl = this.parseUrl(url);
     this.url = parsedUrl.href;
     this.include = this.parseRx(include);
@@ -17,6 +17,7 @@ class ScopedSeed
 
     this.sitemap = this.resolveSiteMap(sitemap);
     this.allowHash = allowHash;
+    this.maxExtraHops = extraHops;
     this.maxDepth = depth < 0 ? 99999 : depth;
   }
 
@@ -93,7 +94,7 @@ class ScopedSeed
     return depth >= this.maxDepth;
   }
 
-  isIncluded(url, depth) {
+  isIncluded(url, depth, extraHops = 0) {
     if (depth > this.maxDepth) {
       return false;
     }
@@ -125,9 +126,15 @@ class ScopedSeed
       }
     }
 
+    let isOOS = false;
+
     if (!inScope) {
-      //console.log(`Not in scope ${url} ${this.include}`);
-      return false;
+      if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
+        isOOS = true;
+      } else {
+        //console.log(`Not in scope ${url} ${this.include}`);
+        return false;
+      }
     }
 
     // check exclusions
@@ -138,7 +145,7 @@ class ScopedSeed
       }
     }
 
-    return url;
+    return {url, isOOS};
   }
 }
 
diff --git a/util/state.js b/util/state.js
index a174548b..ed03a46b 100644
--- a/util/state.js
+++ b/util/state.js
@@ -27,7 +27,7 @@ class BaseState
 
   recheckScope(data, seeds) {
     const seed = seeds[data.seedId];
-    return seed.isIncluded(data.url, data.depth);
+    return seed.isIncluded(data.url, data.depth, data.extraHops);
   }
 }
 
diff --git a/yarn.lock b/yarn.lock
index 9b423fd2..5be1d5bd 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1152,9 +1152,9 @@ camelcase@^6.0.0:
   integrity sha512-c7wVvbw3f37nuobQNtgsgG9POC9qMbNuMQmTCqZv23b6MIz0fcYpBiOlv9gEN/hdLdnZTDQhg6e9Dq5M1vKvfg==
 
 caniuse-lite@^1.0.30001219:
-  version "1.0.30001228"
-  resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001228.tgz#bfdc5942cd3326fa51ee0b42fbef4da9d492a7fa"
-  integrity sha512-QQmLOGJ3DEgokHbMSA8cj2a+geXqmnpyOFT0lhQV6P3/YOJvGDEwoedcwxEQ30gJIwIIunHIicunJ2rzK5gB2A==
+  version "1.0.30001299"
+  resolved "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001299.tgz"
+  integrity sha512-iujN4+x7QzqA2NCSrS5VUy+4gLmRd4xv6vbBBsmfVqTx8bLAD8097euLqQgKxSVLvxjSDcvF1T/i9ocgnUFexw==
 
 capture-exit@^2.0.0:
   version "2.0.0"
@@ -3414,7 +3414,7 @@ nice-try@^1.0.4:
   resolved "https://registry.yarnpkg.com/nice-try/-/nice-try-1.0.5.tgz#a3378a7696ce7d223e88fc9b764bd7ef1089e366"
   integrity sha512-1nh45deeb5olNY7eX82BkPO7SSxR5SSYJiPTrTdFUVYwAl8CKMA5N9PjTYkHiRjisVcxcQ1HXdLhx2qxxJzLNQ==
 
-node-fetch@^2.6.0, node-fetch@^2.6.1:
+node-fetch@^2.6.1:
   version "2.6.1"
   resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
   integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
@@ -4811,15 +4811,15 @@ walker@^1.0.7, walker@~1.0.5:
   dependencies:
     makeerror "1.0.x"
 
-warcio@^1.4.3:
-  version "1.4.5"
-  resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.4.5.tgz#24ca61f799185c5d88cdd0a65d279f376b4f9a63"
-  integrity sha512-VwFBdmEQhWHmxsdyiLM0INHD1KZ2+EGYzslZXFe6JdbuTfSF/dYRQ/wEdvp+m28mydphROF6D32KfkIMRU1NZw==
+warcio@^1.5.0:
+  version "1.5.0"
+  resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.5.0.tgz#da80805f36b26c68c3b79e9d1d334f8df746df3e"
+  integrity sha512-80X3IJ0L5OZYRI/5gIjrLzivP/GVWtWrWsNexvSkfeSafoMsXxViywAuotMh4+WzjrcgDA9SGR1Gpg/uXl/9Fw==
   dependencies:
     "@peculiar/webcrypto" "^1.1.1"
     esm "^3.2.25"
     hi-base32 "^0.5.0"
-    node-fetch "^2.6.0"
+    node-fetch "^2.6.1"
     pako "^1.0.11"
     uuid-random "^1.3.0"
     yargs "^15.3.1"