Mirror of https://github.com/webrecorder/browsertrix-crawler.git

- use DOMSnapshot.captureSnapshot instead of the older DOM.getDocument to get the snapshot (consistent with ArchiveWeb.page); should be slightly more performant
- keep the option to use DOM.getDocument
- refactor WARC resource writing into a separate class, used by text extraction and screenshots
- write extracted text to WARC files as 'urn:text:<url>' after the page loads, similar to screenshots
- also store the final text in the WARC as 'urn:textFinal:<url>' if it is different
- cli options: update `--text` to take one or more comma-separated string options: `--text to-warc,to-pages,final-to-warc`. For backwards compatibility, `--text` and `--text true` are equivalent to `--text to-pages`.

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
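As a rough illustration of the first change, here is a minimal sketch (not the crawler's actual code) of fetching a snapshot over the Chrome DevTools Protocol with DOMSnapshot.captureSnapshot from Puppeteer; the text-collection loop is simplified and the variable names are illustrative:

// minimal sketch, assuming a Puppeteer `page`; not the crawler's implementation
const cdp = await page.target().createCDPSession();

// captureSnapshot returns a shared string table plus one entry per document;
// computedStyles is required by the protocol and can be empty for text-only use
const { documents, strings } = await cdp.send("DOMSnapshot.captureSnapshot", {
  computedStyles: [],
});

// collect text node values: nodeValue holds indices into the strings table
const textParts = [];
for (const doc of documents) {
  const { nodeType, nodeValue } = doc.nodes;
  for (let i = 0; i < nodeValue.length; i++) {
    // nodeType 3 = text node; index -1 means no value for this node
    if (nodeType[i] === 3 && nodeValue[i] >= 0) {
      textParts.push(strings[nodeValue[i]]);
    }
  }
}

And a hedged sketch of how the new `--text` values could be normalized against the EXTRACT_TEXT_TYPES constant from the file below, including the backwards-compatible cases; `parseTextOpts` is a hypothetical helper name, not the crawler's API:

const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

// hypothetical helper: normalize a --text value to a list of extraction modes
function parseTextOpts(value) {
  // backwards compatibility: bare --text or --text true means --text to-pages
  if (value === undefined || value === "" || value === true || value === "true") {
    return ["to-pages"];
  }
  const opts = String(value).split(",").map((opt) => opt.trim());
  for (const opt of opts) {
    if (!EXTRACT_TEXT_TYPES.includes(opt)) {
      throw new Error(`invalid --text option: ${opt}`);
    }
  }
  return opts;
}

// parseTextOpts("to-warc,final-to-warc") -> ["to-warc", "final-to-warc"]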
15 lines · 487 B · JavaScript
// MIME types treated as HTML pages
export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];

// supported values for the page-load wait condition
export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];

// supported values for the --text extraction option
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

// name of the in-page function behaviors use to log messages
export const BEHAVIOR_LOG_FUNC = "__bx_log";

// name of the in-page function used to report discovered links
export const ADD_LINK_FUNC = "__bx_addLink";

// maximum supported crawl depth
export const MAX_DEPTH = 1000000;

// default link extraction rule: the href property of all <a href> elements
export const DEFAULT_SELECTORS = [{
  selector: "a[href]",
  extract: "href",
  isAttribute: false
}];
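For context, a hedged sketch of how one DEFAULT_SELECTORS entry might drive in-page link extraction, reporting each value through the exposed ADD_LINK_FUNC binding; the `extractLinks` helper and its shape are illustrative assumptions, not the crawler's actual code:

// illustrative only: run one selector rule in the page via Puppeteer
async function extractLinks(page, { selector, extract, isAttribute }) {
  await page.evaluate(
    ({ selector, extract, isAttribute, addLinkFunc }) => {
      for (const elem of document.querySelectorAll(selector)) {
        // read either an attribute or a DOM property, per isAttribute
        const value = isAttribute ? elem.getAttribute(extract) : elem[extract];
        if (value) {
          window[addLinkFunc](value);
        }
      }
    },
    { selector, extract, isAttribute, addLinkFunc: "__bx_addLink" }
  );
}

// usage: await extractLinks(page, DEFAULT_SELECTORS[0]);

Note that with isAttribute: false, the default rule reads the elem.href DOM property rather than the raw attribute, so relative URLs come back already resolved against the page URL.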