Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 06:23:16 +00:00
don't disable extraHops when using sitemaps (#639)
- Instead, exclude sitemap-discovered page URLs from the extraHops rules, e.g. if a sitemap page is not in scope, don't include it.
- If extraHops is set together with sitemaps, only apply extraHops to links found on pages that are in scope.
- Bump version to 1.2.4.
Parent: 1a48b37478
Commit: 4fb9577d4f
4 changed files with 30 additions and 12 deletions
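
The change described in the commit message can be summarized as: out-of-scope URLs discovered via a sitemap are now dropped outright instead of being admitted through the extraHops budget, while links found on in-scope pages keep the existing extraHops behavior. A minimal sketch of that decision, simplified from the ScopedSeed.isIncluded logic in the diff below (the standalone function and its name are illustrative only):

// Sketch only: the real check lives in ScopedSeed.isIncluded and returns
// { url, isOOS } | false; this boils it down to "should the URL be queued?".
function shouldQueue(
  inScope: boolean,
  extraHops: number,
  maxExtraHops: number,
  noOOS: boolean,
): boolean {
  if (inScope) {
    return true; // in-scope URLs are always queued
  }
  // Out of scope: only admissible as an "extra hop" when noOOS is false.
  // Sitemap-discovered URLs are queued with noOOS = true, so they never get in this way.
  return !noOOS && maxExtraHops > 0 && extraHops <= maxExtraHops;
}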
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.2.3",
+  "version": "1.2.4",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",

@@ -627,10 +627,23 @@ export class Crawler {
       url,
       depth,
       extraHops,
-    }: { seedId: number; url: string; depth: number; extraHops: number },
+      noOOS,
+    }: {
+      seedId: number;
+      url: string;
+      depth: number;
+      extraHops: number;
+      noOOS: boolean;
+    },
     logDetails = {},
   ) {
-    return this.seeds[seedId].isIncluded(url, depth, extraHops, logDetails);
+    return this.seeds[seedId].isIncluded(
+      url,
+      depth,
+      extraHops,
+      logDetails,
+      noOOS,
+    );
   }

   async isInScope(

@@ -1995,7 +2008,14 @@ self.__bx_behaviors.selectMainBehavior();
     const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;

     callbacks.addLink = async (url: string) => {
-      await this.queueInScopeUrls(seedId, [url], depth, extraHops, logDetails);
+      await this.queueInScopeUrls(
+        seedId,
+        [url],
+        depth,
+        extraHops,
+        false,
+        logDetails,
+      );
     };

     const loadLinks = (options: {

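The bare false in the rewritten call above is the new noOOS argument. An annotated version of the same call (comments added for illustration; names as in the diff):

await this.queueInScopeUrls(
  seedId,
  [url],       // links extracted from the current page
  depth,
  extraHops,
  false,       // noOOS: in-page links may still be queued as extra hops
  logDetails,
);
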
@@ -2071,6 +2091,7 @@ self.__bx_behaviors.selectMainBehavior();
     urls: string[],
     depth: number,
     extraHops = 0,
+    noOOS = false,
     logDetails: LogDetails = {},
   ) {
     try {

@@ -2081,7 +2102,7 @@ self.__bx_behaviors.selectMainBehavior();

     for (const possibleUrl of urls) {
       const res = this.getScope(
-        { url: possibleUrl, extraHops: newExtraHops, depth, seedId },
+        { url: possibleUrl, extraHops: newExtraHops, depth, seedId, noOOS },
         logDetails,
       );

@@ -2330,10 +2351,6 @@ self.__bx_behaviors.selectMainBehavior();

     let finished = false;

-    // disable extraHops for sitemap found URLs by setting to extraHops limit + 1
-    // otherwise, all sitemap found URLs would be eligible for additional hops
-    const extraHopsDisabled = this.params.extraHops + 1;
-
     await new Promise<void>((resolve) => {
       sitemapper.on("end", () => {
         resolve();

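For context on the removed workaround: ScopedSeed.isIncluded (see the seeds diff further down) only treats an out-of-scope URL as an extra hop when extraHops is at or below the limit, so seeding sitemap URLs above the limit could never pass that check. It worked, but only by abusing the hop counter. A sketch of the old effect:

// Old behavior (removed above): start sitemap URLs over the extraHops budget.
const extraHopsDisabled = this.params.extraHops + 1;
// Inside isIncluded, the extra-hop branch could then never be taken:
//   if (this.maxExtraHops && extraHops <= this.maxExtraHops) { isOOS = true; }
// The new code instead passes extraHops = 0 plus an explicit noOOS = true.
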
@@ -2361,7 +2378,7 @@ self.__bx_behaviors.selectMainBehavior();
           "sitemap",
         );
       }
-      this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
+      this.queueInScopeUrls(seedId, [url], 0, 0, true);
       if (count >= 100 && !resolved) {
         logger.info(
           "Sitemap partially parsed, continue parsing large sitemap in the background",

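After this change the two queueInScopeUrls call sites differ only in the depth/extraHops values and the noOOS flag (both calls condensed from the hunks above; comments are illustrative):

// Links found on an in-scope page: normal depth/extraHops accounting, noOOS = false.
await this.queueInScopeUrls(seedId, [url], depth, extraHops, false, logDetails);

// URLs found in a sitemap: depth 0, no extra hops credited, noOOS = true,
// so out-of-scope sitemap URLs are skipped rather than queued as extra hops.
this.queueInScopeUrls(seedId, [url], 0, 0, true);
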
@@ -234,6 +234,7 @@ export class ScopedSeed {
     depth: number,
     extraHops = 0,
     logDetails = {},
+    noOOS = false,
   ): { url: string; isOOS: boolean } | false {
     if (depth > this.maxDepth) {
       return false;

@@ -272,7 +273,7 @@ export class ScopedSeed {
     let isOOS = false;

     if (!inScope) {
-      if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
+      if (!noOOS && this.maxExtraHops && extraHops <= this.maxExtraHops) {
         isOOS = true;
       } else {
         //console.log(`Not in scope ${url} ${this.include}`);

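For reference, a sketch of how a caller interprets the isIncluded result with the new flag (illustrative only; the real consumption happens inside Crawler.queueInScopeUrls, and the argument order follows the getScope forwarding shown earlier):

// isIncluded(url, depth, extraHops, logDetails, noOOS) returns
// { url, isOOS } for URLs that should be queued, or false otherwise.
const res = seed.isIncluded(url, depth, extraHops, logDetails, /* noOOS */ true);
if (res === false) {
  // skipped entirely; with noOOS = true this now covers out-of-scope sitemap URLs
} else if (res.isOOS) {
  // queued as an extra hop (never taken when noOOS is true)
} else {
  // queued as a normal in-scope URL
}
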
@@ -80,6 +80,6 @@ test("test sitemap with application/xml content-type", async () => {
 });


-test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => {
+test("test sitemap with narrow scope, extraHops, to ensure out-of-scope sitemap URLs do not count as extraHops", async () => {
   await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
 });