mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
don't disable extraHops when using sitemaps: (#639)
- instead, exclude sitemap-discovered page URLs from being counted toward extra hops rules, e.g. if a sitemap page is not in scope, don't include it. - if extraHops is set with sitemaps, only consider extraHops for links from pages that are in scope. - bump version to 1.2.4
This commit is contained in:
parent
1a48b37478
commit
4fb9577d4f
4 changed files with 30 additions and 12 deletions
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "browsertrix-crawler",
|
"name": "browsertrix-crawler",
|
||||||
"version": "1.2.3",
|
"version": "1.2.4",
|
||||||
"main": "browsertrix-crawler",
|
"main": "browsertrix-crawler",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
|
|
|
@ -627,10 +627,23 @@ export class Crawler {
|
||||||
url,
|
url,
|
||||||
depth,
|
depth,
|
||||||
extraHops,
|
extraHops,
|
||||||
}: { seedId: number; url: string; depth: number; extraHops: number },
|
noOOS,
|
||||||
|
}: {
|
||||||
|
seedId: number;
|
||||||
|
url: string;
|
||||||
|
depth: number;
|
||||||
|
extraHops: number;
|
||||||
|
noOOS: boolean;
|
||||||
|
},
|
||||||
logDetails = {},
|
logDetails = {},
|
||||||
) {
|
) {
|
||||||
return this.seeds[seedId].isIncluded(url, depth, extraHops, logDetails);
|
return this.seeds[seedId].isIncluded(
|
||||||
|
url,
|
||||||
|
depth,
|
||||||
|
extraHops,
|
||||||
|
logDetails,
|
||||||
|
noOOS,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
async isInScope(
|
async isInScope(
|
||||||
|
@ -1995,7 +2008,14 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
|
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
|
||||||
|
|
||||||
callbacks.addLink = async (url: string) => {
|
callbacks.addLink = async (url: string) => {
|
||||||
await this.queueInScopeUrls(seedId, [url], depth, extraHops, logDetails);
|
await this.queueInScopeUrls(
|
||||||
|
seedId,
|
||||||
|
[url],
|
||||||
|
depth,
|
||||||
|
extraHops,
|
||||||
|
false,
|
||||||
|
logDetails,
|
||||||
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
const loadLinks = (options: {
|
const loadLinks = (options: {
|
||||||
|
@ -2071,6 +2091,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
urls: string[],
|
urls: string[],
|
||||||
depth: number,
|
depth: number,
|
||||||
extraHops = 0,
|
extraHops = 0,
|
||||||
|
noOOS = false,
|
||||||
logDetails: LogDetails = {},
|
logDetails: LogDetails = {},
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
|
@ -2081,7 +2102,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
|
|
||||||
for (const possibleUrl of urls) {
|
for (const possibleUrl of urls) {
|
||||||
const res = this.getScope(
|
const res = this.getScope(
|
||||||
{ url: possibleUrl, extraHops: newExtraHops, depth, seedId },
|
{ url: possibleUrl, extraHops: newExtraHops, depth, seedId, noOOS },
|
||||||
logDetails,
|
logDetails,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -2330,10 +2351,6 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
|
|
||||||
let finished = false;
|
let finished = false;
|
||||||
|
|
||||||
// disable extraHops for sitemap found URLs by setting to extraHops limit + 1
|
|
||||||
// otherwise, all sitemap found URLs would be eligible for additional hops
|
|
||||||
const extraHopsDisabled = this.params.extraHops + 1;
|
|
||||||
|
|
||||||
await new Promise<void>((resolve) => {
|
await new Promise<void>((resolve) => {
|
||||||
sitemapper.on("end", () => {
|
sitemapper.on("end", () => {
|
||||||
resolve();
|
resolve();
|
||||||
|
@ -2361,7 +2378,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
"sitemap",
|
"sitemap",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
|
this.queueInScopeUrls(seedId, [url], 0, 0, true);
|
||||||
if (count >= 100 && !resolved) {
|
if (count >= 100 && !resolved) {
|
||||||
logger.info(
|
logger.info(
|
||||||
"Sitemap partially parsed, continue parsing large sitemap in the background",
|
"Sitemap partially parsed, continue parsing large sitemap in the background",
|
||||||
|
|
|
@ -234,6 +234,7 @@ export class ScopedSeed {
|
||||||
depth: number,
|
depth: number,
|
||||||
extraHops = 0,
|
extraHops = 0,
|
||||||
logDetails = {},
|
logDetails = {},
|
||||||
|
noOOS = false,
|
||||||
): { url: string; isOOS: boolean } | false {
|
): { url: string; isOOS: boolean } | false {
|
||||||
if (depth > this.maxDepth) {
|
if (depth > this.maxDepth) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -272,7 +273,7 @@ export class ScopedSeed {
|
||||||
let isOOS = false;
|
let isOOS = false;
|
||||||
|
|
||||||
if (!inScope) {
|
if (!inScope) {
|
||||||
if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
|
if (!noOOS && this.maxExtraHops && extraHops <= this.maxExtraHops) {
|
||||||
isOOS = true;
|
isOOS = true;
|
||||||
} else {
|
} else {
|
||||||
//console.log(`Not in scope ${url} ${this.include}`);
|
//console.log(`Not in scope ${url} ${this.include}`);
|
||||||
|
|
|
@ -80,6 +80,6 @@ test("test sitemap with application/xml content-type", async () => {
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => {
|
test("test sitemap with narrow scope, extraHops, to ensure out-of-scope sitemap URLs do not count as extraHops", async () => {
|
||||||
await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
|
await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
|
||||||
});
|
});
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue