don't disable extraHops when using sitemaps: (#639)

- instead, exclude sitemap-discovered page URLs from being counted toward the extraHops rules, e.g. if a sitemap page is not in scope, don't include it.
- if extraHops is set with sitemaps, only consider extraHops for links from pages that are in scope.
- bump version to 1.2.4
This commit is contained in:
Ilya Kreymer 2024-07-11 19:48:43 -07:00 committed by GitHub
parent 1a48b37478
commit 4fb9577d4f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 30 additions and 12 deletions

View file

@ -1,6 +1,6 @@
{ {
"name": "browsertrix-crawler", "name": "browsertrix-crawler",
"version": "1.2.3", "version": "1.2.4",
"main": "browsertrix-crawler", "main": "browsertrix-crawler",
"type": "module", "type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler", "repository": "https://github.com/webrecorder/browsertrix-crawler",

View file

@ -627,10 +627,23 @@ export class Crawler {
url, url,
depth, depth,
extraHops, extraHops,
}: { seedId: number; url: string; depth: number; extraHops: number }, noOOS,
}: {
seedId: number;
url: string;
depth: number;
extraHops: number;
noOOS: boolean;
},
logDetails = {}, logDetails = {},
) { ) {
return this.seeds[seedId].isIncluded(url, depth, extraHops, logDetails); return this.seeds[seedId].isIncluded(
url,
depth,
extraHops,
logDetails,
noOOS,
);
} }
async isInScope( async isInScope(
@ -1995,7 +2008,14 @@ self.__bx_behaviors.selectMainBehavior();
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data; const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
callbacks.addLink = async (url: string) => { callbacks.addLink = async (url: string) => {
await this.queueInScopeUrls(seedId, [url], depth, extraHops, logDetails); await this.queueInScopeUrls(
seedId,
[url],
depth,
extraHops,
false,
logDetails,
);
}; };
const loadLinks = (options: { const loadLinks = (options: {
@ -2071,6 +2091,7 @@ self.__bx_behaviors.selectMainBehavior();
urls: string[], urls: string[],
depth: number, depth: number,
extraHops = 0, extraHops = 0,
noOOS = false,
logDetails: LogDetails = {}, logDetails: LogDetails = {},
) { ) {
try { try {
@ -2081,7 +2102,7 @@ self.__bx_behaviors.selectMainBehavior();
for (const possibleUrl of urls) { for (const possibleUrl of urls) {
const res = this.getScope( const res = this.getScope(
{ url: possibleUrl, extraHops: newExtraHops, depth, seedId }, { url: possibleUrl, extraHops: newExtraHops, depth, seedId, noOOS },
logDetails, logDetails,
); );
@ -2330,10 +2351,6 @@ self.__bx_behaviors.selectMainBehavior();
let finished = false; let finished = false;
// disable extraHops for sitemap found URLs by setting to extraHops limit + 1
// otherwise, all sitemap found URLs would be eligible for additional hops
const extraHopsDisabled = this.params.extraHops + 1;
await new Promise<void>((resolve) => { await new Promise<void>((resolve) => {
sitemapper.on("end", () => { sitemapper.on("end", () => {
resolve(); resolve();
@ -2361,7 +2378,7 @@ self.__bx_behaviors.selectMainBehavior();
"sitemap", "sitemap",
); );
} }
this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled); this.queueInScopeUrls(seedId, [url], 0, 0, true);
if (count >= 100 && !resolved) { if (count >= 100 && !resolved) {
logger.info( logger.info(
"Sitemap partially parsed, continue parsing large sitemap in the background", "Sitemap partially parsed, continue parsing large sitemap in the background",

View file

@ -234,6 +234,7 @@ export class ScopedSeed {
depth: number, depth: number,
extraHops = 0, extraHops = 0,
logDetails = {}, logDetails = {},
noOOS = false,
): { url: string; isOOS: boolean } | false { ): { url: string; isOOS: boolean } | false {
if (depth > this.maxDepth) { if (depth > this.maxDepth) {
return false; return false;
@ -272,7 +273,7 @@ export class ScopedSeed {
let isOOS = false; let isOOS = false;
if (!inScope) { if (!inScope) {
if (this.maxExtraHops && extraHops <= this.maxExtraHops) { if (!noOOS && this.maxExtraHops && extraHops <= this.maxExtraHops) {
isOOS = true; isOOS = true;
} else { } else {
//console.log(`Not in scope ${url} ${this.include}`); //console.log(`Not in scope ${url} ${this.include}`);

View file

@ -80,6 +80,6 @@ test("test sitemap with application/xml content-type", async () => {
}); });
test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => { test("test sitemap with narrow scope, extraHops, to ensure out-of-scope sitemap URLs do not count as extraHops", async () => {
await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page"); await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
}); });