// MIME types treated as HTML documents
export const HTML_TYPES = [
  "text/html",
  "application/xhtml",
  "application/xhtml+xml",
];

// Valid page-load conditions: Puppeteer's page.goto() lifecycle events
export const WAIT_UNTIL_OPTS = [
  "load",
  "domcontentloaded",
  "networkidle0",
  "networkidle2",
];
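// Usage sketch (not crawler code): each WAIT_UNTIL_OPTS entry is a valid
// `waitUntil` lifecycle event for Puppeteer's page.goto().
import puppeteer from "puppeteer";

const browser = await puppeteer.launch();
const page = await browser.newPage();
// wait until there are no more than 2 network connections for 500 ms
await page.goto("https://example.com/", { waitUntil: "networkidle2" });
await browser.close();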
SAX-based sitemap parser (#497)
Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser
Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a
time (currently 5); see the sketch after this message
- `fromDate` and `toDate` filter dates, to include only URLs between the
given dates, also filtering nested sitemap lists
- async parsing: after the first 100 URLs, parsing continues in the
background
- timeout of 30 seconds for the initial fetch / first 100 URLs, to avoid
slowing down the crawl
- save/load state integration: mark in redis whether sitemaps have already
been parsed and serialize that to the saved state, to avoid reparsing
(will reparse if parsing did not fully finish)
- aware of `pageLimit`: don't add URLs past the page limit, and interrupt
further parsing when at the limit
- robots.txt `sitemap:` parsing, checking URL extension and MIME type
- automatic detection of sitemaps for a seed URL if no sitemap URL is
provided: first check robots.txt, then /sitemap.xml
- tests for full sitemap autodetect, sitemap with a limit, and sitemap
from a specific URL
Fixes #496
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
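A minimal sketch of the recursive, p-queue-driven approach described above,
using the real `p-queue` and `sax` packages. The names `parseSitemap` and
`DateRange` are illustrative, not the crawler's actual API, and this version
buffers each sitemap in memory rather than streaming it:

import PQueue from "p-queue";
import sax from "sax";

// Parse up to 5 sitemaps concurrently, per the commit message.
const queue = new PQueue({ concurrency: 5 });

interface DateRange {
  fromDate?: Date;
  toDate?: Date;
}

// Recursively parse a sitemap or sitemap index, emitting page URLs whose
// <lastmod> falls within the optional date range; nested sitemaps are
// queued so at most 5 are processed at a time.
async function parseSitemap(
  url: string,
  range: DateRange,
  emit: (pageUrl: string) => void,
): Promise<void> {
  const xml = await (await fetch(url)).text();
  const parser = sax.parser(false, { lowercase: true });

  let tag = "";
  let loc = "";
  let lastmod = "";

  parser.onopentag = (node) => {
    tag = node.name;
  };
  parser.ontext = (text) => {
    if (tag === "loc") loc += text.trim();
    if (tag === "lastmod") lastmod += text.trim();
  };
  parser.onclosetag = (name) => {
    tag = "";
    // act only when a full <url> or <sitemap> entry closes
    if (name !== "url" && name !== "sitemap") {
      return;
    }
    const mod = lastmod ? new Date(lastmod) : null;
    const inRange =
      (!range.fromDate || !mod || mod.getTime() >= range.fromDate.getTime()) &&
      (!range.toDate || !mod || mod.getTime() <= range.toDate.getTime());
    if (inRange && loc) {
      const target = loc; // capture before reset below
      if (name === "sitemap") {
        // entry in a <sitemapindex>: recurse via the queue
        void queue.add(() => parseSitemap(target, range, emit));
      } else {
        emit(target);
      }
    }
    loc = "";
    lastmod = "";
  };

  parser.write(xml).close();
}

// Example: parse a sitemap and print every page URL once the queue drains.
void queue.add(() =>
  parseSitemap("https://example.com/sitemap.xml", {}, (u) => console.log(u)),
);
await queue.onIdle();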
// Service worker handling options
export const SERVICE_WORKER_OPTS = [
  "disabled",
  "disabled-if-profile",
  "enabled",
] as const;

export type ServiceWorkerOpt = (typeof SERVICE_WORKER_OPTS)[number];
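// Hypothetical helper (not in the crawler): because SERVICE_WORKER_OPTS is
// declared `as const`, ServiceWorkerOpt narrows to the union of the three
// literal strings, so a runtime check can safely narrow arbitrary input.
function parseServiceWorkerOpt(value: string): ServiceWorkerOpt {
  if ((SERVICE_WORKER_OPTS as readonly string[]).includes(value)) {
    return value as ServiceWorkerOpt;
  }
  throw new Error(`invalid service worker option: ${value}`);
}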
// Sentinel value: autodetect the sitemap for a seed URL
// (per #497: check robots.txt first, then /sitemap.xml)
export const DETECT_SITEMAP = "<detect>";
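// Sketch of the autodetection the sitemap commit describes: look for a
// `Sitemap:` line in robots.txt, then fall back to /sitemap.xml.
// `detectSitemap` is a hypothetical helper, not the crawler's actual function.
async function detectSitemap(seedUrl: string): Promise<string | null> {
  const origin = new URL(seedUrl).origin;
  try {
    const robots = await (await fetch(origin + "/robots.txt")).text();
    const match = robots.match(/^sitemap:\s*(\S+)/im);
    if (match) {
      return match[1];
    }
  } catch {
    // no robots.txt or fetch failed: fall through to the default path
  }
  const fallback = origin + "/sitemap.xml";
  const resp = await fetch(fallback, { method: "HEAD" });
  return resp.ok ? fallback : null;
}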
// Text extraction output modes
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
// Names of functions used in the page context: behavior logging and
// queueing newly discovered links
export const BEHAVIOR_LOG_FUNC = "__bx_log";

export const ADD_LINK_FUNC = "__bx_addLink";
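// Sketch of how such names can be bridged into the page so in-page behavior
// scripts can call back into the crawler. This assumes Puppeteer's
// page.exposeFunction(); whether the crawler uses exactly this mechanism is
// an assumption, and `queueUrl` is a hypothetical callback.
import type { Page } from "puppeteer";

async function exposeCrawlerFuncs(
  page: Page,
  queueUrl: (url: string) => void,
) {
  // In-page code can now call window.__bx_log(...) and window.__bx_addLink(...)
  await page.exposeFunction(BEHAVIOR_LOG_FUNC, (data: unknown) =>
    console.log("behavior:", data),
  );
  await page.exposeFunction(ADD_LINK_FUNC, (url: string) => queueUrl(url));
}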
// Effectively unbounded crawl depth
export const MAX_DEPTH = 1000000;

export const FETCH_HEADERS_TIMEOUT_SECS = 30;

export const PAGE_OP_TIMEOUT_SECS = 5;

// Timeout for the initial sitemap fetch / first batch of URLs (see #497)
export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
// Default link-extraction rule: read the `href` property (not the raw
// attribute) of each <a href> element, so relative links come back as
// absolute URLs
export const DEFAULT_SELECTORS = [
  {
    selector: "a[href]",
    extract: "href",
    isAttribute: false,
  },
];
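// Sketch of how a selector spec like DEFAULT_SELECTORS[0] can drive in-page
// link extraction (hypothetical helper, assuming Puppeteer). With
// isAttribute: false the element property is read (e.g. a.href, already
// resolved to an absolute URL) rather than the raw attribute value.
import type { Page } from "puppeteer";

interface SelectorSpec {
  selector: string;
  extract: string;
  isAttribute: boolean;
}

async function extractLinks(page: Page, spec: SelectorSpec): Promise<string[]> {
  return page.evaluate(({ selector, extract, isAttribute }) => {
    return Array.from(document.querySelectorAll(selector), (el) => {
      const value = isAttribute
        ? el.getAttribute(extract)
        : (el as unknown as Record<string, unknown>)[extract];
      return typeof value === "string" ? value : "";
    }).filter((href) => href !== "");
  }, spec);
}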
// X display used when running the browser under a virtual display (e.g. Xvfb)
export const DISPLAY = ":99";