Seed URL issues: when adding seeds, log "duplicate seeds" or "limit reached" errors only once

This commit is contained in:
Ilya Kreymer 2025-09-12 09:55:46 -07:00
parent 8d87293932
commit 13f940aa49

View file

@ -129,6 +129,8 @@ export class Crawler {
limitHit = false;
pageLimit: number;
dupeSeedsFound = false;
saveStateFiles: string[] = [];
lastSaveTime: number;
@ -2466,10 +2468,19 @@ self.__bx_behaviors.selectMainBehavior();
{ url, ...logDetails },
"links",
);
if (!this.limitHit && depth === 0) {
logger.error(
"Page limit reached when adding URL list, some URLs not crawled.",
);
}
this.limitHit = true;
return false;
case QueueState.DUPE_URL:
if (!this.dupeSeedsFound && depth === 0) {
logger.error("Duplicate seed URLs found and skipped");
this.dupeSeedsFound = true;
}
logger.debug(
"Page URL not queued, already seen",
{ url, ...logDetails },