link extraction optimization: for scopeType page, set depth == extraHops to avoid getting links (#364)

if we know no additional links will be used
This commit is contained in:
Ilya Kreymer 2023-08-31 13:42:14 -07:00 committed by GitHub
parent cf404efa13
commit 3c2f5f8934
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 8 additions and 1 deletions

View file

@ -1142,6 +1142,7 @@ self.__bx_behaviors.selectMainBehavior();
// skip extraction if at max depth
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
logger.debug("Skipping Link Extraction, At Max Depth");
return;
}

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.10.4",
"version": "0.10.5",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",

View file

@ -23,6 +23,12 @@ export class ScopedSeed
this.include = [...include, ...this.include];
}
// for page scope, the depth is set to extraHops, as no other
// crawling is done
if (this.scopeType === "page") {
depth = extraHops;
}
this.sitemap = this.resolveSiteMap(sitemap);
this.allowHash = allowHash;
this.maxExtraHops = extraHops;