mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
link extraction optimization: for scopeType page, set depth == extraHops to avoid getting links (#364)
if we know no additional links will be used
This commit is contained in:
parent
cf404efa13
commit
3c2f5f8934
3 changed files with 8 additions and 1 deletions
|
@ -1142,6 +1142,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
// skip extraction if at max depth
|
||||
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
||||
logger.debug("Skipping Link Extraction, At Max Depth");
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "0.10.4",
|
||||
"version": "0.10.5",
|
||||
"main": "browsertrix-crawler",
|
||||
"type": "module",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
|
|
|
@ -23,6 +23,12 @@ export class ScopedSeed
|
|||
this.include = [...include, ...this.include];
|
||||
}
|
||||
|
||||
// for page scope, the depth is set to extraHops, as no other
|
||||
// crawling is done
|
||||
if (this.scopeType === "page") {
|
||||
depth = extraHops;
|
||||
}
|
||||
|
||||
this.sitemap = this.resolveSiteMap(sitemap);
|
||||
this.allowHash = allowHash;
|
||||
this.maxExtraHops = extraHops;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue