Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 06:23:16 +00:00)

Adds a new SAX-based sitemap parser, inspired by https://www.npmjs.com/package/sitemap-stream-parser

Supports:

- recursively parsing sitemap indexes, using p-queue to process N at a time (currently 5); a sketch of this approach follows below
- `fromDate` and `toDate` date filters, to include only URLs between the given dates; the filtering is also applied to nested sitemap lists
- async parsing: after the first 100 URLs, parsing continues in the background
- a 30-second timeout for the initial fetch / first 100 URLs, to avoid slowing down the crawl
- save/load state integration: marks in Redis whether a sitemap has already been parsed and serializes that flag to the saved state, to avoid reparsing (will reparse if parsing did not fully finish)
- awareness of `pageLimit`: doesn't add URLs past the page limit, and interrupts further parsing once at the limit
- robots.txt `sitemap:` parsing, with checks of the URL extension and MIME type
- automatic detection of sitemaps for a seed URL if no sitemap URL is provided: first check robots.txt, then /sitemap.xml
- tests: full sitemap autodetect, sitemap with a limit, and sitemap from a specific URL

Fixes #496

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
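A minimal sketch of the recursive, date-filtered parsing described above, using the `sax` and `p-queue` packages. `parseSitemap`, `addUrl`, and `inWindow` are illustrative names, not the crawler's actual API:

```ts
import sax from "sax";
import PQueue from "p-queue";

// Process up to 5 nested sitemaps at a time, as described above.
const queue = new PQueue({ concurrency: 5 });

// True if a lastmod date (when present) falls inside the fromDate/toDate window.
function inWindow(lastMod?: Date, fromDate?: Date, toDate?: Date): boolean {
  if (!lastMod) return true;
  if (fromDate && lastMod < fromDate) return false;
  if (toDate && lastMod > toDate) return false;
  return true;
}

async function parseSitemap(
  url: string,
  addUrl: (url: string) => void,
  fromDate?: Date,
  toDate?: Date,
): Promise<void> {
  const xml = await (await fetch(url)).text();
  const parser = sax.parser(false, { trim: true, lowercase: true });

  let inLoc = false;
  let inLastMod = false;
  let loc = "";
  let lastMod: Date | undefined;

  parser.onopentag = ({ name }) => {
    if (name === "loc") {
      inLoc = true;
      loc = "";
    } else if (name === "lastmod") {
      inLastMod = true;
    }
  };

  parser.ontext = (text) => {
    if (inLoc) loc += text;
    else if (inLastMod) lastMod = new Date(text);
  };

  parser.onclosetag = (name) => {
    if (name === "loc") {
      inLoc = false;
    } else if (name === "lastmod") {
      inLastMod = false;
    } else if (name === "sitemap") {
      // A <sitemap> entry inside a <sitemapindex>: recurse via the queue,
      // applying the date filter to nested sitemap lists as well.
      if (inWindow(lastMod, fromDate, toDate)) {
        const child = loc;
        queue.add(() => parseSitemap(child, addUrl, fromDate, toDate));
      }
      lastMod = undefined;
    } else if (name === "url") {
      // A <url> entry inside a <urlset>: add it if lastmod is in the window.
      if (inWindow(lastMod, fromDate, toDate)) addUrl(loc);
      lastMod = undefined;
    }
  };

  parser.write(xml).close();
}
```

A caller could `await queue.onIdle()` to wait for nested sitemaps to finish; the real parser additionally enforces the 100-URL/30-second initial window and the `pageLimit` cutoff described above.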
27 lines · 563 B · TypeScript
export const HTML_TYPES = [
  "text/html",
  "application/xhtml",
  "application/xhtml+xml",
];

export const WAIT_UNTIL_OPTS = [
  "load",
  "domcontentloaded",
  "networkidle0",
  "networkidle2",
];

export const DETECT_SITEMAP = "<detect>";

export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

export const BEHAVIOR_LOG_FUNC = "__bx_log";
export const ADD_LINK_FUNC = "__bx_addLink";
export const MAX_DEPTH = 1000000;

export const DEFAULT_SELECTORS = [
  {
    selector: "a[href]",
    extract: "href",
    isAttribute: false,
  },
];
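As context for `DEFAULT_SELECTORS`, here is a hypothetical sketch of how a selector spec of this shape could drive in-page link extraction. `extractLinks` and `SelectorSpec` are illustrative names; this is not the crawler's actual extraction code, where links are presumably reported back through the exposed `__bx_addLink` function:

```ts
// Shape of one entry in DEFAULT_SELECTORS.
type SelectorSpec = {
  selector: string; // CSS selector to match elements
  extract: string; // attribute or DOM property to read
  isAttribute: boolean; // read the raw attribute vs. the resolved DOM property
};

function extractLinks(doc: Document, specs: SelectorSpec[]): string[] {
  const links: string[] = [];
  for (const { selector, extract, isAttribute } of specs) {
    for (const elem of doc.querySelectorAll(selector)) {
      // With isAttribute: false, reading the "href" property yields an
      // absolute URL, while the attribute may hold a relative one.
      const value = isAttribute
        ? elem.getAttribute(extract)
        : (elem as unknown as Record<string, unknown>)[extract];
      if (typeof value === "string") {
        links.push(value);
      }
    }
  }
  return links;
}

// e.g. extractLinks(document, DEFAULT_SELECTORS)
```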