From 4c1ee2d2e4f718655ae81b5d4674222a5858325c Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 4 Dec 2025 16:03:08 -0800
Subject: [PATCH] additional logging, resolve relative sitemap urls, e.g.
 '/sitemap.xml' in robots.txt

---
 src/util/sitemapper.ts | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts
index 65f3d640..494462f6 100644
--- a/src/util/sitemapper.ts
+++ b/src/util/sitemapper.ts
@@ -134,20 +134,31 @@ export class SitemapReader extends EventEmitter {
     }
   }
 
-  private async parseRobotsForSitemap(url: string) {
+  private async parseRobotsForSitemap(robotsUrl: string) {
     let sitemapFound = false;
     try {
-      const resp = await this._fetchWithRetry(url, TEXT_CONTENT_TYPE);
+      logger.debug(
+        "Sitemap: Parsing robots to detect sitemap",
+        { url: robotsUrl },
+        "sitemap",
+      );
+      const resp = await this._fetchWithRetry(robotsUrl, TEXT_CONTENT_TYPE);
       if (!resp) {
         return sitemapFound;
       }
       const text = await resp.text();
 
-      text.replace(/^Sitemap:\s?([^\s]+)$/gim, (m, url) => {
-        this.addNewSitemap(url, null);
-        sitemapFound = true;
-        return url;
+      text.replace(/^Sitemap:\s?([^\s]+)$/gim, (m, urlStr) => {
+        try {
+          const url = new URL(urlStr, robotsUrl).href;
+          logger.debug("Sitemap: Added from robots", { url }, "sitemap");
+          this.addNewSitemap(url, null);
+          sitemapFound = true;
+        } catch (e) {
+          // ignore invalid
+        }
+        return urlStr;
       });
     } catch (e) {
       //
@@ -159,6 +170,8 @@ export class SitemapReader extends EventEmitter {
     try {
      this.seenSitemapSet.add(url);
 
+      logger.debug("Parsing sitemap XML", url, "sitemap");
+
       const resp = await this._fetchWithRetry(url);
       if (!resp) {
         return;
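
Reviewer note: the relative-URL fix leans entirely on the WHATWG URL
constructor's base-URL resolution, which is built into Node.js. The sketch
below is illustrative only and not part of the patch (the example.com values
are made up); it shows what the new "new URL(urlStr, robotsUrl)" call does
with each kind of captured Sitemap line from robots.txt.

// TypeScript sketch, assuming a robots.txt fetched from example.com.
const robotsUrl = "https://example.com/robots.txt";

// A relative entry such as "Sitemap: /sitemap.xml" now resolves against
// the robots.txt URL instead of being passed through unresolved:
console.log(new URL("/sitemap.xml", robotsUrl).href);
// -> https://example.com/sitemap.xml

// Absolute entries ignore the base and pass through unchanged:
console.log(new URL("https://cdn.example.com/sitemap.xml", robotsUrl).href);
// -> https://cdn.example.com/sitemap.xml

// A malformed entry makes the constructor throw a TypeError; the new
// try/catch swallows it so one bad line cannot abort parsing of the
// remaining Sitemap entries in robots.txt:
try {
  new URL("http://", robotsUrl);
} catch (e) {
  console.log("skipped invalid sitemap URL"); // the "ignore invalid" branch
}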