Surface lastmod option for sitemap parser (#367)

* Surface lastmod option for sitemap parser
- Add --sitemapFromDate, for use along with --useSitemap, which filters sitemap URLs to those last modified on or after
the specified ISO date.

The library used to parse sitemaps for URLs added an optional
"lastmod" argument in v3.2.5 that allows filtering the returned URLs
by the "lastmod" element present in sitemap XML files. This commit
surfaces that argument as a browsertrix-crawler CLI runtime
parameter.

This can be useful for orienting a crawl around a list of seeds
known to contain sitemaps, when you are only interested in including
URLs that have been modified on or after a given date.

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
Graham Hukill 2023-09-13 13:20:41 -04:00 committed by GitHub
parent f8508a85ab
commit 1eeee2c215
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 26 additions and 5 deletions

View file

@ -162,6 +162,10 @@ Options:
--useSitemap, --sitemap If enabled, check for sitemaps at /s
itemap.xml, or custom URL if URL is
specified
--sitemapFromDate, --sitemapFrom If set, filter URLs from sitemaps to
those greater than or equal to prov
ided ISO Date string (YYYY-MM-DD or
YYYY-MM-DDTHH:MM:SS or partial date)
--statsFilename If set, output stats as JSON to this
file. (Relative filename resolves t
o crawl working directory)

View file

@ -794,7 +794,7 @@ self.__bx_behaviors.selectMainBehavior();
}
if (seed.sitemap) {
await this.parseSitemap(seed.sitemap, i);
await this.parseSitemap(seed.sitemap, i, this.params.sitemapFromDate);
}
}
@ -1438,18 +1438,30 @@ self.__bx_behaviors.selectMainBehavior();
}
}
async parseSitemap(url, seedId) {
async parseSitemap(url, seedId, sitemapFromDate) {
// handle sitemap last modified date if passed
let lastmodFromTimestamp = null;
const dateObj = new Date(sitemapFromDate);
if (isNaN(dateObj.getTime())) {
logger.info("Fetching full sitemap (fromDate not specified/valid)", {url, sitemapFromDate}, "sitemap");
} else {
lastmodFromTimestamp = dateObj.getTime();
logger.info("Fetching and filtering sitemap by date", {url, sitemapFromDate}, "sitemap");
}
const sitemapper = new Sitemapper({
url,
timeout: 15000,
requestHeaders: this.headers
requestHeaders: this.headers,
lastmod: lastmodFromTimestamp
});
try {
const { sites } = await sitemapper.fetch();
logger.info("Sitemap Urls Found", {urls: sites.length}, "sitemap");
await this.queueInScopeUrls(seedId, sites, 0);
} catch(e) {
logger.warn("Error fetching sites from sitemap", e);
logger.warn("Error fetching sites from sitemap", e, "sitemap");
}
}

View file

@ -21,7 +21,7 @@
"minio": "7.0.26",
"puppeteer-core": "^20.7.4",
"sharp": "^0.32.1",
"sitemapper": "^3.1.2",
"sitemapper": "^3.2.5",
"uuid": "8.3.2",
"warcio": "^1.6.0",
"ws": "^7.4.4",

View file

@ -227,6 +227,11 @@ class ArgParser {
describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
},
"sitemapFromDate": {
alias: "sitemapFrom",
describe: "If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
},
"statsFilename": {
describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
},