Add additional logging and resolve relative sitemap URLs (e.g. '/sitemap.xml') found in robots.txt

This commit is contained in:
Ilya Kreymer 2025-12-04 16:03:08 -08:00
parent 42883b1da8
commit 4c1ee2d2e4

View file

@@ -134,20 +134,31 @@ export class SitemapReader extends EventEmitter {
} }
} }
private async parseRobotsForSitemap(url: string) { private async parseRobotsForSitemap(robotsUrl: string) {
let sitemapFound = false; let sitemapFound = false;
try { try {
const resp = await this._fetchWithRetry(url, TEXT_CONTENT_TYPE); logger.debug(
"Sitemap: Parsing robots to detect sitemap",
{ url: robotsUrl },
"sitemap",
);
const resp = await this._fetchWithRetry(robotsUrl, TEXT_CONTENT_TYPE);
if (!resp) { if (!resp) {
return sitemapFound; return sitemapFound;
} }
const text = await resp.text(); const text = await resp.text();
text.replace(/^Sitemap:\s?([^\s]+)$/gim, (m, url) => { text.replace(/^Sitemap:\s?([^\s]+)$/gim, (m, urlStr) => {
this.addNewSitemap(url, null); try {
sitemapFound = true; const url = new URL(urlStr, robotsUrl).href;
return url; logger.debug("Sitemap: Added from robots", { url }, "sitemap");
this.addNewSitemap(url, null);
sitemapFound = true;
} catch (e) {
// ignore invalid
}
return urlStr;
}); });
} catch (e) { } catch (e) {
// //
@@ -159,6 +170,8 @@ export class SitemapReader extends EventEmitter {
try { try {
this.seenSitemapSet.add(url); this.seenSitemapSet.add(url);
logger.debug("Parsing sitemap XML", url, "sitemap");
const resp = await this._fetchWithRetry(url); const resp = await this._fetchWithRetry(url);
if (!resp) { if (!resp) {
return; return;