Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Synced 2025-10-19 06:23:16 +00:00
Support Extra Hops beyond current scope with --extraHops option (#98)
* extra hops depth: add support for --extraHops option, which expands the inclusion scope to go N 'extra hops' beyond the existing scope. fixes most common use case in #83
* update README with info on `extraHops`, add tests for extraHops
* dependency fix: use pywb 2.6.3, warcio 1.5.0
* bump to 0.5.0-beta.2
This commit is contained in:
parent 9f541ab011
commit 201eab4ad1
10 changed files with 102 additions and 31 deletions
README.md | 10

@@ -269,6 +269,16 @@ In addition to the inclusion rules, Browsertrix Crawler supports a separate list
 
 The exclusion regexes are often used with a custom scope, but could be used with a predefined scopeType as well.
 
+#### Extra 'Hops' Beyond Current Scope
+
+Occasionally, it may be useful to augment the scope by allowing extra links N 'hops' beyond the current scope.
+
+For example, this is most useful when crawling with a `host` or `prefix` scope, but also wanting to include 'one extra hop': any link to external pages beyond the current host, without following links found on those pages. This is now possible with the `extraHops` setting, which defaults to 0, but can be set to a higher value N (usually 1) to go beyond the current scope.
+
+The `--extraHops` setting can be set globally or per seed to expand the current inclusion scope N 'hops' beyond the configured scope. Note that this mechanism only expands the inclusion scope; exclusion rules are still applied, and a URL matched by an exclusion rule is excluded even if it falls within the extra hops.
+
 #### Scope Rule Examples
 
 For example, the following seed will start on `https://example.com/startpage.html` and crawl all pages on the `https://example.com/` domain, except pages that match the regexes `example.com/skip.*` or `example.com/search.*`
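As an illustration of the setting described above, here is a sketch of an invocation adapted from the new test added in this commit; the `--scopeType host` flag is an assumption based on the `host` scope the README discusses, and the crawl limit is arbitrary. A host-scoped crawl like this can still capture pages one link outside the host:

```
docker run -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ --scopeType host --extraHops 1 --limit 10
```

Pages reached via that extra hop are captured, but out-of-scope links found on them are not followed unless `--extraHops` is raised further, and exclusion regexes still apply to them.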
crawler.js | 33

@@ -401,7 +401,7 @@ class Crawler {
 
     for (let i = 0; i < this.params.scopedSeeds.length; i++) {
       const seed = this.params.scopedSeeds[i];
-      if (!await this.queueUrl(i, seed.url, 0)) {
+      if (!await this.queueUrl(i, seed.url, 0, 0)) {
         if (this.limitHit) {
           break;
         }
@@ -479,7 +479,7 @@ class Crawler {
   }
 
   async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
-    const {url, seedId, depth} = urlData;
+    const {url, seedId, depth, extraHops = 0} = urlData;
 
     if (!await this.isHTML(url)) {
       try {
@@ -509,7 +509,7 @@ class Crawler {
 
     for (const opts of selectorOptsList) {
       const links = await this.extractLinks(page, opts);
-      await this.queueInScopeUrls(seedId, links, depth);
+      await this.queueInScopeUrls(seedId, links, depth, extraHops);
     }
   }
 
@@ -544,16 +544,25 @@ class Crawler {
     return results;
   }
 
-  async queueInScopeUrls(seedId, urls, depth) {
+  async queueInScopeUrls(seedId, urls, depth, extraHops = 0) {
     try {
       depth += 1;
       const seed = this.params.scopedSeeds[seedId];
 
-      for (const url of urls) {
-        const captureUrl = seed.isIncluded(url, depth);
+      // new number of extra hops, set if this hop is out-of-scope (oos)
+      const newExtraHops = extraHops + 1;
 
-        if (captureUrl) {
-          await this.queueUrl(seedId, captureUrl, depth);
+      for (const possibleUrl of urls) {
+        const res = seed.isIncluded(possibleUrl, depth, newExtraHops);
+
+        if (!res) {
+          continue;
+        }
+
+        const {url, isOOS} = res;
+
+        if (url) {
+          await this.queueUrl(seedId, url, depth, isOOS ? newExtraHops : extraHops);
         }
       }
     } catch (e) {
@@ -561,7 +570,7 @@ class Crawler {
     }
   }
 
-  async queueUrl(seedId, url, depth) {
+  async queueUrl(seedId, url, depth, extraHops = 0) {
     if (this.limitHit) {
       return false;
     }
@@ -576,7 +585,11 @@ class Crawler {
     }
 
     await this.crawlState.add(url);
-    this.cluster.queue({url, seedId, depth});
+    const urlData = {url, seedId, depth};
+    if (extraHops) {
+      urlData.extraHops = extraHops;
+    }
+    this.cluster.queue(urlData);
     return true;
   }
 
package.json

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.5.0-beta.1",
+  "version": "0.5.0-beta.2",
   "main": "browsertrix-crawler",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
@@ -27,6 +27,6 @@
     "eslint-plugin-react": "^7.22.0",
     "jest": "^26.6.3",
     "md5": "^2.3.0",
-    "warcio": "^1.4.3"
+    "warcio": "^1.5.0"
   }
 }
requirements.txt

@@ -1,4 +1,4 @@
-#pywb>=2.6.0
-git+https://github.com/webrecorder/pywb@twitter-rw
+pywb>=2.6.3
+#git+https://github.com/webrecorder/pywb@twitter-rw
 uwsgi
 wacz>=0.3.2
tests/extra_hops_depth.test.js | 34 (new file)

@@ -0,0 +1,34 @@
+const util = require("util");
+const exec = util.promisify(require("child_process").exec);
+const fs = require("fs");
+
+test("check that URLs are crawled 2 extra hops beyond depth", async () => {
+  jest.setTimeout(60000);
+
+  try {
+    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
+  }
+  catch (error) {
+    console.log(error);
+  }
+
+  const crawled_pages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
+
+  const expectedPages = [
+    "https://example.com/",
+    "https://www.iana.org/domains/example",
+    "http://www.iana.org/",
+    "http://www.iana.org/domains",
+    "http://www.iana.org/protocols",
+    "http://www.iana.org/numbers",
+    "http://www.iana.org/about",
+  ];
+
+  for (const page of crawled_pages.trim().split("\n")) {
+    const url = JSON.parse(page).url;
+    if (!url) {
+      continue;
+    }
+    expect(expectedPages.indexOf(url) >= 0).toBe(true);
+  }
+});
@@ -2,10 +2,10 @@ const util = require("util");
 const exec = util.promisify(require("child_process").exec);
 const fs = require("fs");
 
-test("check that all urls in a file list are crawled when the filelisturl param is passed", async () => {
+test("check that URLs one-depth out from the seed-list are crawled", async () => {
   jest.setTimeout(30000);
 
-  try{
+  try {
     await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
   }
util/argParser.js

@@ -61,6 +61,12 @@ class ArgParser {
       type: "number",
     },
 
+    "extraHops": {
+      describe: "Number of extra 'hops' to follow, beyond the current scope",
+      default: 0,
+      type: "number"
+    },
+
     "limit": {
       describe: "Limit crawl to this number of pages",
       default: 0,
@@ -366,6 +372,7 @@ class ArgParser {
       include: argv.include,
       exclude: argv.exclude,
       depth: argv.depth,
+      extraHops: argv.extraHops,
     };
 
     argv.scopedSeeds = [];
util/seeds.js

@@ -1,6 +1,6 @@
 class ScopedSeed
 {
-  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
+  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
     const parsedUrl = this.parseUrl(url);
     this.url = parsedUrl.href;
     this.include = this.parseRx(include);
@@ -17,6 +17,7 @@ class ScopedSeed
 
     this.sitemap = this.resolveSiteMap(sitemap);
     this.allowHash = allowHash;
+    this.maxExtraHops = extraHops;
     this.maxDepth = depth < 0 ? 99999 : depth;
   }
 
@@ -93,7 +94,7 @@ class ScopedSeed
     return depth >= this.maxDepth;
   }
 
-  isIncluded(url, depth) {
+  isIncluded(url, depth, extraHops = 0) {
     if (depth > this.maxDepth) {
       return false;
     }
@@ -125,9 +126,15 @@ class ScopedSeed
       }
     }
 
+    let isOOS = false;
+
     if (!inScope) {
-      //console.log(`Not in scope ${url} ${this.include}`);
-      return false;
+      if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
+        isOOS = true;
+      } else {
+        //console.log(`Not in scope ${url} ${this.include}`);
+        return false;
+      }
     }
 
     // check exclusions
@@ -138,7 +145,7 @@ class ScopedSeed
       }
     }
 
-    return url;
+    return {url, isOOS};
   }
 }
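To make the interaction between the two changes concrete (`ScopedSeed.isIncluded` now returning `{url, isOOS}`, and `queueInScopeUrls` choosing which hop count to queue), here is a minimal standalone sketch. It is not the crawler's code: the real scope check is reduced to a URL-prefix test, and `seed`, `prefix`, and `queueLinks` are illustrative names only; `maxExtraHops` and `isOOS` mirror the names in the diff above.

```js
// Standalone illustration of the hop accounting in this commit (not project code).
function isIncluded(seed, url, extraHops) {
  const inScope = url.startsWith(seed.prefix);
  let isOOS = false;

  if (!inScope) {
    if (seed.maxExtraHops && extraHops <= seed.maxExtraHops) {
      isOOS = true;   // out of scope, but still within the extra-hop budget
    } else {
      return false;   // out of scope and budget exhausted: drop the URL
    }
  }
  return {url, isOOS};
}

function queueLinks(seed, links, pageExtraHops) {
  // links found on a page count one hop further out if they leave the scope
  const newExtraHops = pageExtraHops + 1;
  const queued = [];

  for (const link of links) {
    const res = isIncluded(seed, link, newExtraHops);
    if (!res) {
      continue;
    }
    // in-scope links keep the page's hop count; out-of-scope links advance it
    queued.push({url: res.url, extraHops: res.isOOS ? newExtraHops : pageExtraHops});
  }
  return queued;
}

const seed = {prefix: "https://example.com/", maxExtraHops: 1};

// From the seed page (0 hops): the in-scope link keeps extraHops 0, the
// external link is queued with extraHops 1.
console.log(queueLinks(seed, ["https://example.com/page", "https://www.iana.org/"], 0));

// From a page that is already 1 hop out: a further external link would be
// 2 hops, exceeding maxExtraHops = 1, so nothing is queued.
console.log(queueLinks(seed, ["https://www.iana.org/domains"], 1));
```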
util/state.js

@@ -27,7 +27,7 @@ class BaseState
   recheckScope(data, seeds) {
     const seed = seeds[data.seedId];
 
-    return seed.isIncluded(data.url, data.depth);
+    return seed.isIncluded(data.url, data.depth, data.extraHops);
   }
 }
yarn.lock | 18

@@ -1152,9 +1152,9 @@ camelcase@^6.0.0:
   integrity sha512-c7wVvbw3f37nuobQNtgsgG9POC9qMbNuMQmTCqZv23b6MIz0fcYpBiOlv9gEN/hdLdnZTDQhg6e9Dq5M1vKvfg==
 
 caniuse-lite@^1.0.30001219:
-  version "1.0.30001228"
-  resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001228.tgz#bfdc5942cd3326fa51ee0b42fbef4da9d492a7fa"
-  integrity sha512-QQmLOGJ3DEgokHbMSA8cj2a+geXqmnpyOFT0lhQV6P3/YOJvGDEwoedcwxEQ30gJIwIIunHIicunJ2rzK5gB2A==
+  version "1.0.30001299"
+  resolved "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001299.tgz"
+  integrity sha512-iujN4+x7QzqA2NCSrS5VUy+4gLmRd4xv6vbBBsmfVqTx8bLAD8097euLqQgKxSVLvxjSDcvF1T/i9ocgnUFexw==
 
 capture-exit@^2.0.0:
   version "2.0.0"
@@ -3414,7 +3414,7 @@ nice-try@^1.0.4:
   resolved "https://registry.yarnpkg.com/nice-try/-/nice-try-1.0.5.tgz#a3378a7696ce7d223e88fc9b764bd7ef1089e366"
   integrity sha512-1nh45deeb5olNY7eX82BkPO7SSxR5SSYJiPTrTdFUVYwAl8CKMA5N9PjTYkHiRjisVcxcQ1HXdLhx2qxxJzLNQ==
 
-node-fetch@^2.6.0, node-fetch@^2.6.1:
+node-fetch@^2.6.1:
   version "2.6.1"
   resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
   integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
 
@@ -4811,15 +4811,15 @@ walker@^1.0.7, walker@~1.0.5:
   dependencies:
     makeerror "1.0.x"
 
-warcio@^1.4.3:
-  version "1.4.5"
-  resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.4.5.tgz#24ca61f799185c5d88cdd0a65d279f376b4f9a63"
-  integrity sha512-VwFBdmEQhWHmxsdyiLM0INHD1KZ2+EGYzslZXFe6JdbuTfSF/dYRQ/wEdvp+m28mydphROF6D32KfkIMRU1NZw==
+warcio@^1.5.0:
+  version "1.5.0"
+  resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.5.0.tgz#da80805f36b26c68c3b79e9d1d334f8df746df3e"
+  integrity sha512-80X3IJ0L5OZYRI/5gIjrLzivP/GVWtWrWsNexvSkfeSafoMsXxViywAuotMh4+WzjrcgDA9SGR1Gpg/uXl/9Fw==
   dependencies:
     "@peculiar/webcrypto" "^1.1.1"
     esm "^3.2.25"
     hi-base32 "^0.5.0"
-    node-fetch "^2.6.0"
+    node-fetch "^2.6.1"
     pako "^1.0.11"
     uuid-random "^1.3.0"
     yargs "^15.3.1"