mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-07 13:49:47 +00:00
additional logging; resolve relative sitemap URLs, e.g. '/sitemap.xml', in robots.txt
This commit is contained in:
parent
42883b1da8
commit
4c1ee2d2e4
1 changed files with 19 additions and 6 deletions
|
|
@ -134,20 +134,31 @@ export class SitemapReader extends EventEmitter {
|
|||
}
|
||||
}
|
||||
|
||||
private async parseRobotsForSitemap(url: string) {
|
||||
private async parseRobotsForSitemap(robotsUrl: string) {
|
||||
let sitemapFound = false;
|
||||
try {
|
||||
const resp = await this._fetchWithRetry(url, TEXT_CONTENT_TYPE);
|
||||
logger.debug(
|
||||
"Sitemap: Parsing robots to detect sitemap",
|
||||
{ url: robotsUrl },
|
||||
"sitemap",
|
||||
);
|
||||
const resp = await this._fetchWithRetry(robotsUrl, TEXT_CONTENT_TYPE);
|
||||
if (!resp) {
|
||||
return sitemapFound;
|
||||
}
|
||||
|
||||
const text = await resp.text();
|
||||
|
||||
text.replace(/^Sitemap:\s?([^\s]+)$/gim, (m, url) => {
|
||||
this.addNewSitemap(url, null);
|
||||
sitemapFound = true;
|
||||
return url;
|
||||
text.replace(/^Sitemap:\s?([^\s]+)$/gim, (m, urlStr) => {
|
||||
try {
|
||||
const url = new URL(urlStr, robotsUrl).href;
|
||||
logger.debug("Sitemap: Added from robots", { url }, "sitemap");
|
||||
this.addNewSitemap(url, null);
|
||||
sitemapFound = true;
|
||||
} catch (e) {
|
||||
// ignore invalid
|
||||
}
|
||||
return urlStr;
|
||||
});
|
||||
} catch (e) {
|
||||
//
|
||||
|
|
@ -159,6 +170,8 @@ export class SitemapReader extends EventEmitter {
|
|||
try {
|
||||
this.seenSitemapSet.add(url);
|
||||
|
||||
logger.debug("Parsing sitemap XML", url, "sitemap");
|
||||
|
||||
const resp = await this._fetchWithRetry(url);
|
||||
if (!resp) {
|
||||
return;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue