From 4fb9577d4f3e532f2ff1bcc102701d2e788f313d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 11 Jul 2024 19:48:43 -0700 Subject: [PATCH] don't disable extraHops when using sitemaps: (#639) - instead, exclude sitemap-discovered page URLs from being counted toward the extraHops rules, e.g. if a sitemap page is not in scope, don't include it. - if extraHops is set with sitemaps, only consider extraHops for links from pages that are in scope. - bump version to 1.2.4 --- package.json | 2 +- src/crawler.ts | 35 ++++++++++++++++++++++++++--------- src/util/seeds.ts | 3 ++- tests/sitemap-parse.test.js | 2 +- 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/package.json b/package.json index ae9aab5e..e6801362 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.2.3", + "version": "1.2.4", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/src/crawler.ts b/src/crawler.ts index 26a28f97..cf940e83 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -627,10 +627,23 @@ export class Crawler { url, depth, extraHops, - }: { seedId: number; url: string; depth: number; extraHops: number }, + noOOS, + }: { + seedId: number; + url: string; + depth: number; + extraHops: number; + noOOS: boolean; + }, logDetails = {}, ) { - return this.seeds[seedId].isIncluded(url, depth, extraHops, logDetails); + return this.seeds[seedId].isIncluded( + url, + depth, + extraHops, + logDetails, + noOOS, + ); } async isInScope( @@ -1995,7 +2008,14 @@ self.__bx_behaviors.selectMainBehavior(); const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data; callbacks.addLink = async (url: string) => { - await this.queueInScopeUrls(seedId, [url], depth, extraHops, logDetails); + await this.queueInScopeUrls( + seedId, + [url], + depth, + extraHops, + false, + logDetails, + ); }; const loadLinks = (options: { @@ -2071,6 +2091,7 @@ 
self.__bx_behaviors.selectMainBehavior(); urls: string[], depth: number, extraHops = 0, + noOOS = false, logDetails: LogDetails = {}, ) { try { @@ -2081,7 +2102,7 @@ self.__bx_behaviors.selectMainBehavior(); for (const possibleUrl of urls) { const res = this.getScope( - { url: possibleUrl, extraHops: newExtraHops, depth, seedId }, + { url: possibleUrl, extraHops: newExtraHops, depth, seedId, noOOS }, logDetails, ); @@ -2330,10 +2351,6 @@ self.__bx_behaviors.selectMainBehavior(); let finished = false; - // disable extraHops for sitemap found URLs by setting to extraHops limit + 1 - // otherwise, all sitemap found URLs would be eligible for additional hops - const extraHopsDisabled = this.params.extraHops + 1; - await new Promise((resolve) => { sitemapper.on("end", () => { resolve(); @@ -2361,7 +2378,7 @@ self.__bx_behaviors.selectMainBehavior(); "sitemap", ); } - this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled); + this.queueInScopeUrls(seedId, [url], 0, 0, true); if (count >= 100 && !resolved) { logger.info( "Sitemap partially parsed, continue parsing large sitemap in the background", diff --git a/src/util/seeds.ts b/src/util/seeds.ts index 66898cc8..61b12cfb 100644 --- a/src/util/seeds.ts +++ b/src/util/seeds.ts @@ -234,6 +234,7 @@ export class ScopedSeed { depth: number, extraHops = 0, logDetails = {}, + noOOS = false, ): { url: string; isOOS: boolean } | false { if (depth > this.maxDepth) { return false; @@ -272,7 +273,7 @@ export class ScopedSeed { let isOOS = false; if (!inScope) { - if (this.maxExtraHops && extraHops <= this.maxExtraHops) { + if (!noOOS && this.maxExtraHops && extraHops <= this.maxExtraHops) { isOOS = true; } else { //console.log(`Not in scope ${url} ${this.include}`); diff --git a/tests/sitemap-parse.test.js b/tests/sitemap-parse.test.js index 8ede14d8..1caa35d5 100644 --- a/tests/sitemap-parse.test.js +++ b/tests/sitemap-parse.test.js @@ -80,6 +80,6 @@ test("test sitemap with application/xml content-type", async () => { }); 
-test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => { +test("test sitemap with narrow scope, extraHops, to ensure out-of-scope sitemap URLs do not count as extraHops", async () => { await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page"); });