mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
Additional logging; resolve relative sitemap URLs (e.g. '/sitemap.xml') found in robots.txt
This commit is contained in:
parent
42883b1da8
commit
4c1ee2d2e4
1 changed file with 19 additions and 6 deletions
|
|
@ -134,20 +134,31 @@ export class SitemapReader extends EventEmitter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private async parseRobotsForSitemap(url: string) {
|
private async parseRobotsForSitemap(robotsUrl: string) {
|
||||||
let sitemapFound = false;
|
let sitemapFound = false;
|
||||||
try {
|
try {
|
||||||
const resp = await this._fetchWithRetry(url, TEXT_CONTENT_TYPE);
|
logger.debug(
|
||||||
|
"Sitemap: Parsing robots to detect sitemap",
|
||||||
|
{ url: robotsUrl },
|
||||||
|
"sitemap",
|
||||||
|
);
|
||||||
|
const resp = await this._fetchWithRetry(robotsUrl, TEXT_CONTENT_TYPE);
|
||||||
if (!resp) {
|
if (!resp) {
|
||||||
return sitemapFound;
|
return sitemapFound;
|
||||||
}
|
}
|
||||||
|
|
||||||
const text = await resp.text();
|
const text = await resp.text();
|
||||||
|
|
||||||
text.replace(/^Sitemap:\s?([^\s]+)$/gim, (m, url) => {
|
text.replace(/^Sitemap:\s?([^\s]+)$/gim, (m, urlStr) => {
|
||||||
this.addNewSitemap(url, null);
|
try {
|
||||||
sitemapFound = true;
|
const url = new URL(urlStr, robotsUrl).href;
|
||||||
return url;
|
logger.debug("Sitemap: Added from robots", { url }, "sitemap");
|
||||||
|
this.addNewSitemap(url, null);
|
||||||
|
sitemapFound = true;
|
||||||
|
} catch (e) {
|
||||||
|
// ignore invalid
|
||||||
|
}
|
||||||
|
return urlStr;
|
||||||
});
|
});
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
//
|
//
|
||||||
|
|
@ -159,6 +170,8 @@ export class SitemapReader extends EventEmitter {
|
||||||
try {
|
try {
|
||||||
this.seenSitemapSet.add(url);
|
this.seenSitemapSet.add(url);
|
||||||
|
|
||||||
|
logger.debug("Parsing sitemap XML", url, "sitemap");
|
||||||
|
|
||||||
const resp = await this._fetchWithRetry(url);
|
const resp = await this._fetchWithRetry(url);
|
||||||
if (!resp) {
|
if (!resp) {
|
||||||
return;
|
return;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue