Support Extra Hops beyond current scope with --extraHops option (#98)

* extra hops depth: add support for the --extraHops option, which expands the inclusion scope to go N 'extra hops' beyond the existing scope. Fixes the most common use case in #83

* update README with info on `extraHops`, add tests for extraHops

* dependency fix: use pywb 2.6.3, warcio 1.5.0

* bump to 0.5.0-beta.2
Ilya Kreymer 2022-01-15 09:03:09 -08:00 committed by GitHub
parent 9f541ab011
commit 201eab4ad1
10 changed files with 102 additions and 31 deletions


@@ -269,6 +269,16 @@ In addition to the inclusion rules, Browsertrix Crawler supports a separate list
The exclusion regexes are often used with a custom scope, but could be used with a predefined scopeType as well.
#### Extra 'Hops' Beyond Current Scope
Occasionally, it may be useful to augment the scope by allowing links N 'hops' beyond the current scope.
For example, this is most useful when crawling with a `host` or `prefix` scope while also including 'one extra hop': any link to external pages beyond the current host, without following those links further. This is now possible with the `extraHops` setting, which defaults to 0 and can be set to a higher value N (usually 1) to go beyond the current scope.
The `--extraHops` setting can be set globally or per seed to expand the current inclusion scope by N 'hops' beyond the configured scope. Note that this mechanism only expands the inclusion scope; any exclusion rules are still applied. If a URL is excluded by the exclusion rules, that exclusion takes precedence over `--extraHops`.
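As a concrete illustration, the integration test added in this commit crawls `https://example.com/` with two extra hops, which pulls in `www.iana.org` pages that are one and two hops outside the seed's scope. The command below is the one the test runs (with the test-fixture volume mount omitted for brevity):

```
docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl \
  --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7
```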
#### Scope Rule Examples
For example, the following seed will start on `https://example.com/startpage.html` and crawl all pages on the `https://example.com/` domain, except pages that match the regexes `example.com/skip.*` or `example.com/search.*`


@@ -401,7 +401,7 @@ class Crawler {
for (let i = 0; i < this.params.scopedSeeds.length; i++) {
const seed = this.params.scopedSeeds[i];
if (!await this.queueUrl(i, seed.url, 0)) {
if (!await this.queueUrl(i, seed.url, 0, 0)) {
if (this.limitHit) {
break;
}
@@ -479,7 +479,7 @@ class Crawler {
}
async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
const {url, seedId, depth} = urlData;
const {url, seedId, depth, extraHops = 0} = urlData;
if (!await this.isHTML(url)) {
try {
@@ -509,7 +509,7 @@ class Crawler {
for (const opts of selectorOptsList) {
const links = await this.extractLinks(page, opts);
await this.queueInScopeUrls(seedId, links, depth);
await this.queueInScopeUrls(seedId, links, depth, extraHops);
}
}
@@ -544,16 +544,25 @@ class Crawler {
return results;
}
async queueInScopeUrls(seedId, urls, depth) {
async queueInScopeUrls(seedId, urls, depth, extraHops = 0) {
try {
depth += 1;
const seed = this.params.scopedSeeds[seedId];
for (const url of urls) {
const captureUrl = seed.isIncluded(url, depth);
// new number of extra hops, set if this hop is out-of-scope (oos)
const newExtraHops = extraHops + 1;
if (captureUrl) {
await this.queueUrl(seedId, captureUrl, depth);
for (const possibleUrl of urls) {
const res = seed.isIncluded(possibleUrl, depth, newExtraHops);
if (!res) {
continue;
}
const {url, isOOS} = res;
if (url) {
await this.queueUrl(seedId, url, depth, isOOS ? newExtraHops : extraHops);
}
}
} catch (e) {
@@ -561,7 +570,7 @@
}
}
async queueUrl(seedId, url, depth) {
async queueUrl(seedId, url, depth, extraHops = 0) {
if (this.limitHit) {
return false;
}
@@ -576,7 +585,11 @@
}
await this.crawlState.add(url);
this.cluster.queue({url, seedId, depth});
const urlData = {url, seedId, depth};
if (extraHops) {
urlData.extraHops = extraHops;
}
this.cluster.queue(urlData);
return true;
}
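To make the new bookkeeping easier to follow, here is a minimal standalone sketch of the hop accounting that `queueInScopeUrls` and `queueUrl` implement above. It is not the actual `Crawler` code: `seed` stands in for a `ScopedSeed`, and `queue` is a plain array standing in for the cluster queue and crawl state.

```js
// Simplified model of the queueing logic above (illustration only).
function queueLinks(seed, links, depth, extraHops, queue) {
  const newDepth = depth + 1;
  // candidate extra-hop count, used only if a link turns out to be out-of-scope (oos)
  const newExtraHops = extraHops + 1;

  for (const possibleUrl of links) {
    const res = seed.isIncluded(possibleUrl, newDepth, newExtraHops);
    if (!res) {
      continue; // too deep, excluded, or no extra hops left
    }
    const { url, isOOS } = res;
    // an out-of-scope link consumes one extra hop; an in-scope link does not
    queue.push({ url, depth: newDepth, extraHops: isOOS ? newExtraHops : extraHops });
  }
}
```

Because `extraHops` travels with each queued URL, a page reached via an extra hop passes the incremented count on to its own outlinks, so the chain stops once the count exceeds `maxExtraHops` inside `ScopedSeed.isIncluded`.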


@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.5.0-beta.1",
"version": "0.5.0-beta.2",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
@@ -27,6 +27,6 @@
"eslint-plugin-react": "^7.22.0",
"jest": "^26.6.3",
"md5": "^2.3.0",
"warcio": "^1.4.3"
"warcio": "^1.5.0"
}
}


@@ -1,4 +1,4 @@
#pywb>=2.6.0
git+https://github.com/webrecorder/pywb@twitter-rw
pywb>=2.6.3
#git+https://github.com/webrecorder/pywb@twitter-rw
uwsgi
wacz>=0.3.2


@@ -0,0 +1,34 @@
const util = require("util");
const exec = util.promisify(require("child_process").exec);
const fs = require("fs");
test("check that URLs are crawled 2 extra hops beyond depth", async () => {
jest.setTimeout(60000);
try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
}
catch (error) {
console.log(error);
}
const crawled_pages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
const expectedPages = [
"https://example.com/",
"https://www.iana.org/domains/example",
"http://www.iana.org/",
"http://www.iana.org/domains",
"http://www.iana.org/protocols",
"http://www.iana.org/numbers",
"http://www.iana.org/about",
];
for (const page of crawled_pages.trim().split("\n")) {
const url = JSON.parse(page).url;
if (!url) {
continue;
}
expect(expectedPages.indexOf(url) >= 0).toBe(true);
}
});


@@ -2,10 +2,10 @@ const util = require("util");
const exec = util.promisify(require("child_process").exec);
const fs = require("fs");
test("check that all urls in a file list are crawled when the filelisturl param is passed", async () => {
test("check that URLs one-depth out from the seed-list are crawled", async () => {
jest.setTimeout(30000);
try{
try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
}


@@ -61,6 +61,12 @@ class ArgParser {
type: "number",
},
"extraHops": {
describe: "Number of extra 'hops' to follow, beyond the current scope",
default: 0,
type: "number"
},
"limit": {
describe: "Limit crawl to this number of pages",
default: 0,
@@ -366,6 +372,7 @@ class ArgParser {
include: argv.include,
exclude: argv.exclude,
depth: argv.depth,
extraHops: argv.extraHops,
};
argv.scopedSeeds = [];


@@ -1,6 +1,6 @@
class ScopedSeed
{
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
const parsedUrl = this.parseUrl(url);
this.url = parsedUrl.href;
this.include = this.parseRx(include);
@@ -17,6 +17,7 @@ class ScopedSeed
this.sitemap = this.resolveSiteMap(sitemap);
this.allowHash = allowHash;
this.maxExtraHops = extraHops;
this.maxDepth = depth < 0 ? 99999 : depth;
}
@@ -93,7 +94,7 @@ class ScopedSeed
return depth >= this.maxDepth;
}
isIncluded(url, depth) {
isIncluded(url, depth, extraHops = 0) {
if (depth > this.maxDepth) {
return false;
}
@@ -125,9 +126,15 @@
}
}
let isOOS = false;
if (!inScope) {
//console.log(`Not in scope ${url} ${this.include}`);
return false;
if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
isOOS = true;
} else {
//console.log(`Not in scope ${url} ${this.include}`);
return false;
}
}
// check exclusions
@@ -138,7 +145,7 @@
}
}
return url;
return {url, isOOS};
}
}
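One detail worth noting in the change above: the extra-hops check only rescues URLs that fail the inclusion rules, and the exclusion regexes are evaluated afterwards, so an excluded URL is dropped even when extra hops remain. A condensed sketch of that decision order (ignoring URL parsing and the regex matching of the real `ScopedSeed`; `inScope` and `excluded` are assumed to be the results of those checks):

```js
// Condensed decision logic of isIncluded() above (illustration only).
function scopeDecision(url, inScope, excluded, extraHops, maxExtraHops) {
  let isOOS = false;
  if (!inScope) {
    if (maxExtraHops && extraHops <= maxExtraHops) {
      isOOS = true; // out of scope, but still within the allowed extra hops
    } else {
      return false; // out of scope and no extra hops available
    }
  }
  if (excluded) {
    return false; // exclusions always take precedence, even for extra-hop URLs
  }
  return { url, isOOS };
}
```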


@@ -27,7 +27,7 @@ class BaseState
recheckScope(data, seeds) {
const seed = seeds[data.seedId];
return seed.isIncluded(data.url, data.depth);
return seed.isIncluded(data.url, data.depth, data.extraHops);
}
}


@@ -1152,9 +1152,9 @@ camelcase@^6.0.0:
integrity sha512-c7wVvbw3f37nuobQNtgsgG9POC9qMbNuMQmTCqZv23b6MIz0fcYpBiOlv9gEN/hdLdnZTDQhg6e9Dq5M1vKvfg==
caniuse-lite@^1.0.30001219:
version "1.0.30001228"
resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001228.tgz#bfdc5942cd3326fa51ee0b42fbef4da9d492a7fa"
integrity sha512-QQmLOGJ3DEgokHbMSA8cj2a+geXqmnpyOFT0lhQV6P3/YOJvGDEwoedcwxEQ30gJIwIIunHIicunJ2rzK5gB2A==
version "1.0.30001299"
resolved "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001299.tgz"
integrity sha512-iujN4+x7QzqA2NCSrS5VUy+4gLmRd4xv6vbBBsmfVqTx8bLAD8097euLqQgKxSVLvxjSDcvF1T/i9ocgnUFexw==
capture-exit@^2.0.0:
version "2.0.0"
@@ -3414,7 +3414,7 @@ nice-try@^1.0.4:
resolved "https://registry.yarnpkg.com/nice-try/-/nice-try-1.0.5.tgz#a3378a7696ce7d223e88fc9b764bd7ef1089e366"
integrity sha512-1nh45deeb5olNY7eX82BkPO7SSxR5SSYJiPTrTdFUVYwAl8CKMA5N9PjTYkHiRjisVcxcQ1HXdLhx2qxxJzLNQ==
node-fetch@^2.6.0, node-fetch@^2.6.1:
node-fetch@^2.6.1:
version "2.6.1"
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
@@ -4811,15 +4811,15 @@ walker@^1.0.7, walker@~1.0.5:
dependencies:
makeerror "1.0.x"
warcio@^1.4.3:
version "1.4.5"
resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.4.5.tgz#24ca61f799185c5d88cdd0a65d279f376b4f9a63"
integrity sha512-VwFBdmEQhWHmxsdyiLM0INHD1KZ2+EGYzslZXFe6JdbuTfSF/dYRQ/wEdvp+m28mydphROF6D32KfkIMRU1NZw==
warcio@^1.5.0:
version "1.5.0"
resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.5.0.tgz#da80805f36b26c68c3b79e9d1d334f8df746df3e"
integrity sha512-80X3IJ0L5OZYRI/5gIjrLzivP/GVWtWrWsNexvSkfeSafoMsXxViywAuotMh4+WzjrcgDA9SGR1Gpg/uXl/9Fw==
dependencies:
"@peculiar/webcrypto" "^1.1.1"
esm "^3.2.25"
hi-base32 "^0.5.0"
node-fetch "^2.6.0"
node-fetch "^2.6.1"
pako "^1.0.11"
uuid-random "^1.3.0"
yargs "^15.3.1"