mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00

* Surface lastmod option for sitemap parser - Add --sitemapFromDate, for use along with --useSitemap, which filters the sitemap to URLs modified on or after the specified ISO date. The library used to parse sitemaps for URLs added an optional "lastmod" argument in v3.2.5 that allows filtering the returned URLs by the "last_modified" element present in sitemap XMLs. This change surfaces that argument in the browsertrix-crawler CLI runtime parameters. This can be useful when orienting a crawl around a list of seeds known to contain sitemaps, where you are only interested in including URLs that have been modified on or after a given date. --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
40 lines
1 KiB
JSON
40 lines
1 KiB
JSON
{
|
|
"name": "browsertrix-crawler",
|
|
"version": "0.10.5",
|
|
"main": "browsertrix-crawler",
|
|
"type": "module",
|
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
|
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
|
"license": "AGPL-3.0-or-later",
|
|
"scripts": {
|
|
"lint": "eslint *.js util/*.js tests/*.test.js",
|
|
"test": "yarn node --experimental-vm-modules $(yarn bin jest --bail 1)",
|
|
"prepare": "husky install"
|
|
},
|
|
"dependencies": {
|
|
"@novnc/novnc": "^1.4.0",
|
|
"browsertrix-behaviors": "^0.5.1",
|
|
"get-folder-size": "^4.0.0",
|
|
"husky": "^8.0.3",
|
|
"ioredis": "^4.27.1",
|
|
"js-yaml": "^4.1.0",
|
|
"minio": "7.0.26",
|
|
"puppeteer-core": "^20.7.4",
|
|
"sharp": "^0.32.1",
|
|
"sitemapper": "^3.2.5",
|
|
"uuid": "8.3.2",
|
|
"warcio": "^1.6.0",
|
|
"ws": "^7.4.4",
|
|
"yargs": "^17.7.2"
|
|
},
|
|
"devDependencies": {
|
|
"eslint": "^7.20.0",
|
|
"eslint-plugin-react": "^7.22.0",
|
|
"jest": "^29.2.1",
|
|
"md5": "^2.3.0"
|
|
},
|
|
"jest": {
|
|
"transform": {},
|
|
"testTimeout": 90000
|
|
}
|
|
}
|