From 201eab4ad1d98e902b21c3b3775c9bd5d88d7d61 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 15 Jan 2022 09:03:09 -0800
Subject: [PATCH] Support Extra Hops beyond current scope with --extraHops
 option (#98)

* extra hops depth: add support for --extraHops option, which expands the inclusion scope to go N 'extra hops' beyond the existing scope. fixes most common use case in #83

* update README with info on `extraHops`, add tests for extraHops

* dependency fix: use pywb 2.6.3, warcio 1.5.0

* bump to 0.5.0-beta.2
---
 README.md                      | 10 ++++++++++
 crawler.js                     | 33 +++++++++++++++++++++++----------
 package.json                   |  4 ++--
 requirements.txt               |  4 ++--
 tests/extra_hops_depth.test.js | 34 ++++++++++++++++++++++++++++++++++
 tests/url_file_list.test.js    |  4 ++--
 util/argParser.js              |  7 +++++++
 util/seeds.js                  | 17 ++++++++++++-----
 util/state.js                  |  2 +-
 yarn.lock                      | 18 +++++++++---------
 10 files changed, 102 insertions(+), 31 deletions(-)
 create mode 100644 tests/extra_hops_depth.test.js

diff --git a/README.md b/README.md
index b594620e..dcaba9cb 100644
--- a/README.md
+++ b/README.md
@@ -269,6 +269,16 @@ In addition to the inclusion rules, Browsertrix Crawler supports a separate list
 
 The exclusion regexes are often used with a custom scope, but could be used with a predefined scopeType as well.
 
+#### Extra 'Hops' Beyond Current Scope
+
+Occasionally, it may be useful to augment the scope by allowing extra links N 'hops' beyond the current scope.
+
+For example, this is most useful when crawling with a `host` or `prefix` scope, but also wanting to include 'one extra hop' - any link to external pages beyond the current host, but not following those links. This is now possible with the `extraHops` setting, which defaults to 0, but can be set to a higher value N (usually 1) to go beyond the current scope.
+
+The `--extraHops` setting can be set globally or per seed to allow expanding the current inclusion scope N 'hops' beyond the configured scope. Note that this mechanism only expands the inclusion scope, and any exclusion rules are still applied. If a URL is to be excluded via the exclusion rules,
+that will take precedence over the `--extraHops`.
+
+
 #### Scope Rule Examples
 
 For example, the following seed will start on `https://example.com/startpage.html` and crawl all pages on the `https://example.com/` domain, except pages that match the regexes `example.com/skip.*` or `example.com/search.*`
diff --git a/crawler.js b/crawler.js
index 81366865..c5593e3e 100644
--- a/crawler.js
+++ b/crawler.js
@@ -401,7 +401,7 @@ class Crawler {
 
     for (let i = 0; i < this.params.scopedSeeds.length; i++) {
       const seed = this.params.scopedSeeds[i];
-      if (!await this.queueUrl(i, seed.url, 0)) {
+      if (!await this.queueUrl(i, seed.url, 0, 0)) {
         if (this.limitHit) {
           break;
         }
@@ -479,7 +479,7 @@ class Crawler {
   }
 
   async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
-    const {url, seedId, depth} = urlData;
+    const {url, seedId, depth, extraHops = 0} = urlData;
 
     if (!await this.isHTML(url)) {
       try {
@@ -509,7 +509,7 @@ class Crawler {
 
     for (const opts of selectorOptsList) {
       const links = await this.extractLinks(page, opts);
-      await this.queueInScopeUrls(seedId, links, depth);
+      await this.queueInScopeUrls(seedId, links, depth, extraHops);
     }
   }
 
@@ -544,16 +544,25 @@ class Crawler {
     return results;
   }
 
-  async queueInScopeUrls(seedId, urls, depth) {
+  async queueInScopeUrls(seedId, urls, depth, extraHops = 0) {
     try {
       depth += 1;
       const seed = this.params.scopedSeeds[seedId];
 
-      for (const url of urls) {
-        const captureUrl = seed.isIncluded(url, depth);
+      // new number of extra hops, set if this hop is out-of-scope (oos)
+      const newExtraHops = extraHops + 1;
 
-        if (captureUrl) {
-          await this.queueUrl(seedId, captureUrl, depth);
+      for (const possibleUrl of urls) {
+        const res = seed.isIncluded(possibleUrl, depth, newExtraHops);
+
+        if (!res) {
+          continue;
+        }
+
+        const {url, isOOS} = res;
+
+        if (url) {
+          await this.queueUrl(seedId, url, depth, isOOS ? newExtraHops : extraHops);
         }
       }
     } catch (e) {
@@ -561,7 +570,7 @@ class Crawler {
     }
   }
 
-  async queueUrl(seedId, url, depth) {
+  async queueUrl(seedId, url, depth, extraHops = 0) {
     if (this.limitHit) {
       return false;
     }
@@ -576,7 +585,11 @@ class Crawler {
     }
 
     await this.crawlState.add(url);
-    this.cluster.queue({url, seedId, depth});
+    const urlData = {url, seedId, depth};
+    if (extraHops) {
+      urlData.extraHops = extraHops;
+    }
+    this.cluster.queue(urlData);
     return true;
   }
 
diff --git a/package.json b/package.json
index 5ebd2c8e..de5bf534 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.5.0-beta.1",
+  "version": "0.5.0-beta.2",
   "main": "browsertrix-crawler",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer , Webrecorder Software",
@@ -27,6 +27,6 @@
     "eslint-plugin-react": "^7.22.0",
     "jest": "^26.6.3",
     "md5": "^2.3.0",
-    "warcio": "^1.4.3"
+    "warcio": "^1.5.0"
   }
 }
diff --git a/requirements.txt b/requirements.txt
index 5efa8dfe..fed48c7e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-#pywb>=2.6.0
-git+https://github.com/webrecorder/pywb@twitter-rw
+pywb>=2.6.3
+#git+https://github.com/webrecorder/pywb@twitter-rw
 uwsgi
 wacz>=0.3.2
diff --git a/tests/extra_hops_depth.test.js b/tests/extra_hops_depth.test.js
new file mode 100644
index 00000000..e5d395fd
--- /dev/null
+++ b/tests/extra_hops_depth.test.js
@@ -0,0 +1,34 @@
+const util = require("util");
+const exec = util.promisify(require("child_process").exec);
+const fs = require("fs");
+
+test("check that URLs are crawled 2 extra hops beyond depth", async () => {
+  jest.setTimeout(60000);
+
+  try {
+    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
+  }
+  catch (error) {
+    console.log(error);
+  }
+
+  const crawled_pages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
+
+  const expectedPages = [
+    "https://example.com/",
+    "https://www.iana.org/domains/example",
+    "http://www.iana.org/",
+    "http://www.iana.org/domains",
+    "http://www.iana.org/protocols",
+    "http://www.iana.org/numbers",
+    "http://www.iana.org/about",
+  ];
+
+  for (const page of crawled_pages.trim().split("\n")) {
+    const url = JSON.parse(page).url;
+    if (!url) {
+      continue;
+    }
+    expect(expectedPages.indexOf(url) >= 0).toBe(true);
+  }
+});
diff --git a/tests/url_file_list.test.js b/tests/url_file_list.test.js
index d9c6a6ce..29e81af6 100644
--- a/tests/url_file_list.test.js
+++ b/tests/url_file_list.test.js
@@ -2,10 +2,10 @@ const util = require("util");
 const exec = util.promisify(require("child_process").exec);
 const fs = require("fs");
 
-test("check that all urls in a file list are crawled when the filelisturl param is passed", async () => {
+test("check that URLs one-depth out from the seed-list are crawled", async () => {
   jest.setTimeout(30000);
 
-  try{
+  try {
     await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
   }
 
diff --git a/util/argParser.js b/util/argParser.js
index d48c2c19..7364d720 100644
--- a/util/argParser.js
+++ b/util/argParser.js
@@ -61,6 +61,12 @@ class ArgParser {
         type: "number",
       },
 
+      "extraHops": {
+        describe: "Number of extra 'hops' to follow, beyond the current scope",
+        default: 0,
+        type: "number"
+      },
+
       "limit": {
         describe: "Limit crawl to this number of pages",
         default: 0,
@@ -366,6 +372,7 @@ class ArgParser {
       include: argv.include,
      exclude: argv.exclude,
       depth: argv.depth,
+      extraHops: argv.extraHops,
     };
 
     argv.scopedSeeds = [];
diff --git a/util/seeds.js b/util/seeds.js
index 2994ffd3..f6446c6f 100644
--- a/util/seeds.js
+++ b/util/seeds.js
@@ -1,6 +1,6 @@
 class ScopedSeed
 {
-  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
+  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
     const parsedUrl = this.parseUrl(url);
     this.url = parsedUrl.href;
     this.include = this.parseRx(include);
@@ -17,6 +17,7 @@ class ScopedSeed
 
     this.sitemap = this.resolveSiteMap(sitemap);
     this.allowHash = allowHash;
+    this.maxExtraHops = extraHops;
     this.maxDepth = depth < 0 ? 99999 : depth;
   }
 
@@ -93,7 +94,7 @@ class ScopedSeed
     return depth >= this.maxDepth;
   }
 
-  isIncluded(url, depth) {
+  isIncluded(url, depth, extraHops = 0) {
     if (depth > this.maxDepth) {
       return false;
     }
@@ -125,9 +126,15 @@ class ScopedSeed
       }
     }
 
+    let isOOS = false;
+
     if (!inScope) {
-      //console.log(`Not in scope ${url} ${this.include}`);
-      return false;
+      if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
+        isOOS = true;
+      } else {
+        //console.log(`Not in scope ${url} ${this.include}`);
+        return false;
+      }
     }
 
     // check exclusions
@@ -138,7 +145,7 @@ class ScopedSeed
       }
     }
 
-    return url;
+    return {url, isOOS};
   }
 }
 
diff --git a/util/state.js b/util/state.js
index a174548b..ed03a46b 100644
--- a/util/state.js
+++ b/util/state.js
@@ -27,7 +27,7 @@ class BaseState
 
   recheckScope(data, seeds) {
     const seed = seeds[data.seedId];
-    return seed.isIncluded(data.url, data.depth);
+    return seed.isIncluded(data.url, data.depth, data.extraHops);
   }
 }
 
diff --git a/yarn.lock b/yarn.lock
index 9b423fd2..5be1d5bd 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1152,9 +1152,9 @@ camelcase@^6.0.0:
   integrity sha512-c7wVvbw3f37nuobQNtgsgG9POC9qMbNuMQmTCqZv23b6MIz0fcYpBiOlv9gEN/hdLdnZTDQhg6e9Dq5M1vKvfg==
 
 caniuse-lite@^1.0.30001219:
-  version "1.0.30001228"
-  resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001228.tgz#bfdc5942cd3326fa51ee0b42fbef4da9d492a7fa"
-  integrity sha512-QQmLOGJ3DEgokHbMSA8cj2a+geXqmnpyOFT0lhQV6P3/YOJvGDEwoedcwxEQ30gJIwIIunHIicunJ2rzK5gB2A==
+  version "1.0.30001299"
+  resolved "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001299.tgz"
+  integrity sha512-iujN4+x7QzqA2NCSrS5VUy+4gLmRd4xv6vbBBsmfVqTx8bLAD8097euLqQgKxSVLvxjSDcvF1T/i9ocgnUFexw==
 
 capture-exit@^2.0.0:
   version "2.0.0"
@@ -3414,7 +3414,7 @@ nice-try@^1.0.4:
   resolved "https://registry.yarnpkg.com/nice-try/-/nice-try-1.0.5.tgz#a3378a7696ce7d223e88fc9b764bd7ef1089e366"
   integrity sha512-1nh45deeb5olNY7eX82BkPO7SSxR5SSYJiPTrTdFUVYwAl8CKMA5N9PjTYkHiRjisVcxcQ1HXdLhx2qxxJzLNQ==
 
-node-fetch@^2.6.0, node-fetch@^2.6.1:
+node-fetch@^2.6.1:
   version "2.6.1"
   resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
   integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
@@ -4811,15 +4811,15 @@ walker@^1.0.7, walker@~1.0.5:
   dependencies:
     makeerror "1.0.x"
 
-warcio@^1.4.3:
-  version "1.4.5"
-  resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.4.5.tgz#24ca61f799185c5d88cdd0a65d279f376b4f9a63"
-  integrity sha512-VwFBdmEQhWHmxsdyiLM0INHD1KZ2+EGYzslZXFe6JdbuTfSF/dYRQ/wEdvp+m28mydphROF6D32KfkIMRU1NZw==
+warcio@^1.5.0:
+  version "1.5.0"
+  resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.5.0.tgz#da80805f36b26c68c3b79e9d1d334f8df746df3e"
+  integrity sha512-80X3IJ0L5OZYRI/5gIjrLzivP/GVWtWrWsNexvSkfeSafoMsXxViywAuotMh4+WzjrcgDA9SGR1Gpg/uXl/9Fw==
   dependencies:
     "@peculiar/webcrypto" "^1.1.1"
     esm "^3.2.25"
     hi-base32 "^0.5.0"
-    node-fetch "^2.6.0"
+    node-fetch "^2.6.1"
     pako "^1.0.11"
     uuid-random "^1.3.0"
     yargs "^15.3.1"