mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Surface lastmod option for sitemap parser (#367)
* Surface lastmod option for sitemap parser - Add --sitemapFromDate to use along with --useSitemap, which will filter sitemap URLs to those modified on or after the specified ISO date. The library used to parse sitemaps for URLs added an optional "lastmod" argument in v3.2.5 that allows filtering the returned URLs by the "last_modified" element present in sitemap XMLs. This surfaces that argument to the browsertrix-crawler CLI runtime parameters. This can be useful for orienting a crawl around a list of seeds known to contain sitemaps, when one is only interested in including URLs that have been modified on or after a given date. --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
f8508a85ab
commit
1eeee2c215
4 changed files with 26 additions and 5 deletions
|
@ -162,6 +162,10 @@ Options:
|
|||
--useSitemap, --sitemap If enabled, check for sitemaps at /s
|
||||
itemap.xml, or custom URL if URL is
|
||||
specified
|
||||
--sitemapFromDate, --sitemapFrom If set, filter URLs from sitemaps to
|
||||
those greater than or equal to prov
|
||||
ided ISO Date string (YYYY-MM-DD or
|
||||
YYYY-MM-DDTHH:MM:SS or partial date)
|
||||
--statsFilename If set, output stats as JSON to this
|
||||
file. (Relative filename resolves t
|
||||
o crawl working directory)
|
||||
|
|
20
crawler.js
20
crawler.js
|
@ -794,7 +794,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
|
||||
if (seed.sitemap) {
|
||||
await this.parseSitemap(seed.sitemap, i);
|
||||
await this.parseSitemap(seed.sitemap, i, this.params.sitemapFromDate);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1438,18 +1438,30 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
}
|
||||
|
||||
async parseSitemap(url, seedId) {
|
||||
async parseSitemap(url, seedId, sitemapFromDate) {
|
||||
// handle sitemap last modified date if passed
|
||||
let lastmodFromTimestamp = null;
|
||||
const dateObj = new Date(sitemapFromDate);
|
||||
if (isNaN(dateObj.getTime())) {
|
||||
logger.info("Fetching full sitemap (fromDate not specified/valid)", {url, sitemapFromDate}, "sitemap");
|
||||
} else {
|
||||
lastmodFromTimestamp = dateObj.getTime();
|
||||
logger.info("Fetching and filtering sitemap by date", {url, sitemapFromDate}, "sitemap");
|
||||
}
|
||||
|
||||
const sitemapper = new Sitemapper({
|
||||
url,
|
||||
timeout: 15000,
|
||||
requestHeaders: this.headers
|
||||
requestHeaders: this.headers,
|
||||
lastmod: lastmodFromTimestamp
|
||||
});
|
||||
|
||||
try {
|
||||
const { sites } = await sitemapper.fetch();
|
||||
logger.info("Sitemap Urls Found", {urls: sites.length}, "sitemap");
|
||||
await this.queueInScopeUrls(seedId, sites, 0);
|
||||
} catch(e) {
|
||||
logger.warn("Error fetching sites from sitemap", e);
|
||||
logger.warn("Error fetching sites from sitemap", e, "sitemap");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
"minio": "7.0.26",
|
||||
"puppeteer-core": "^20.7.4",
|
||||
"sharp": "^0.32.1",
|
||||
"sitemapper": "^3.1.2",
|
||||
"sitemapper": "^3.2.5",
|
||||
"uuid": "8.3.2",
|
||||
"warcio": "^1.6.0",
|
||||
"ws": "^7.4.4",
|
||||
|
|
|
@ -227,6 +227,11 @@ class ArgParser {
|
|||
describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
|
||||
},
|
||||
|
||||
"sitemapFromDate": {
|
||||
alias: "sitemapFrom",
|
||||
describe: "If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
|
||||
},
|
||||
|
||||
"statsFilename": {
|
||||
describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
|
||||
},
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue