don't disable extraHops when using sitemaps: (#639)

- instead, exclude sitemap-discovered page URLs from counting toward the extraHops rules, e.g. if a sitemap page is not in scope, don't include it.
- if extraHops is set with sitemaps, only consider extraHops for links from pages that are in scope.
- bump version to 1.2.4
This commit is contained in:
Ilya Kreymer 2024-07-11 19:48:43 -07:00 committed by GitHub
parent 1a48b37478
commit 4fb9577d4f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 30 additions and 12 deletions

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.2.3",
"version": "1.2.4",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",

View file

@ -627,10 +627,23 @@ export class Crawler {
url,
depth,
extraHops,
}: { seedId: number; url: string; depth: number; extraHops: number },
noOOS,
}: {
seedId: number;
url: string;
depth: number;
extraHops: number;
noOOS: boolean;
},
logDetails = {},
) {
return this.seeds[seedId].isIncluded(url, depth, extraHops, logDetails);
return this.seeds[seedId].isIncluded(
url,
depth,
extraHops,
logDetails,
noOOS,
);
}
async isInScope(
@ -1995,7 +2008,14 @@ self.__bx_behaviors.selectMainBehavior();
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
callbacks.addLink = async (url: string) => {
await this.queueInScopeUrls(seedId, [url], depth, extraHops, logDetails);
await this.queueInScopeUrls(
seedId,
[url],
depth,
extraHops,
false,
logDetails,
);
};
const loadLinks = (options: {
@ -2071,6 +2091,7 @@ self.__bx_behaviors.selectMainBehavior();
urls: string[],
depth: number,
extraHops = 0,
noOOS = false,
logDetails: LogDetails = {},
) {
try {
@ -2081,7 +2102,7 @@ self.__bx_behaviors.selectMainBehavior();
for (const possibleUrl of urls) {
const res = this.getScope(
{ url: possibleUrl, extraHops: newExtraHops, depth, seedId },
{ url: possibleUrl, extraHops: newExtraHops, depth, seedId, noOOS },
logDetails,
);
@ -2330,10 +2351,6 @@ self.__bx_behaviors.selectMainBehavior();
let finished = false;
// disable extraHops for sitemap found URLs by setting to extraHops limit + 1
// otherwise, all sitemap found URLs would be eligible for additional hops
const extraHopsDisabled = this.params.extraHops + 1;
await new Promise<void>((resolve) => {
sitemapper.on("end", () => {
resolve();
@ -2361,7 +2378,7 @@ self.__bx_behaviors.selectMainBehavior();
"sitemap",
);
}
this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
this.queueInScopeUrls(seedId, [url], 0, 0, true);
if (count >= 100 && !resolved) {
logger.info(
"Sitemap partially parsed, continue parsing large sitemap in the background",

View file

@ -234,6 +234,7 @@ export class ScopedSeed {
depth: number,
extraHops = 0,
logDetails = {},
noOOS = false,
): { url: string; isOOS: boolean } | false {
if (depth > this.maxDepth) {
return false;
@ -272,7 +273,7 @@ export class ScopedSeed {
let isOOS = false;
if (!inScope) {
if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
if (!noOOS && this.maxExtraHops && extraHops <= this.maxExtraHops) {
isOOS = true;
} else {
//console.log(`Not in scope ${url} ${this.include}`);

View file

@ -80,6 +80,6 @@ test("test sitemap with application/xml content-type", async () => {
});
test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => {
test("test sitemap with narrow scope, extraHops, to ensure out-of-scope sitemap URLs do not count as extraHops", async () => {
await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
});