mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Better tracking of failed requests + logging context exclude (#485)
Logging:
- add --logExcludeContext for log contexts that should be excluded (while --logContext specifies which are to be included)
- enable 'recorderNetwork' logging for debugging CDP network
- create a default log context exclude list (containing: screencast, recorderNetwork, jsErrors), customizable via --logExcludeContext

recorder: Track failed requests and include them in pageinfo records with status code 0
- clean up CDP handler methods
- intercept requestWillBeSent to track requests that started (but may not complete)
- fix shouldSkip() so that it still works if no URL is provided (e.g. to check only headers)
- set status to 0 for async fetch failures
- remove responseServedFromCache interception, as response data is generally not available then, and responseReceived is still called
- pageinfo: include page requests that failed with status code 0; also include 'error' status if available
- ensure page is closed on failure
- ensure pageinfo is still written even if nothing else is crawled for a page
- track cached responses and add them to debug logging (can also be added to pageinfo later if needed)

tests: add pageinfo test for crawling an invalid URL, which should still result in a pageinfo record with status code 0

bump to 1.0.0-beta.7
This commit is contained in:
parent
65133c9d9d
commit
9f18a49c0a
8 changed files with 299 additions and 131 deletions
|
@ -15,7 +15,11 @@ import {
|
|||
import { ScopedSeed } from "./seeds.js";
|
||||
import { interpolateFilename } from "./storage.js";
|
||||
import { screenshotTypes } from "./screenshots.js";
|
||||
import { LOG_CONTEXT_TYPES, logger } from "./logger.js";
|
||||
import {
|
||||
DEFAULT_EXCLUDE_LOG_CONTEXTS,
|
||||
LOG_CONTEXT_TYPES,
|
||||
logger,
|
||||
} from "./logger.js";
|
||||
|
||||
// ============================================================================
|
||||
class ArgParser {
|
||||
|
@ -225,6 +229,14 @@ class ArgParser {
|
|||
coerce,
|
||||
},
|
||||
|
||||
logExcludeContext: {
|
||||
describe: "Comma-separated list of contexts to NOT include in logs",
|
||||
type: "array",
|
||||
default: DEFAULT_EXCLUDE_LOG_CONTEXTS,
|
||||
choices: LOG_CONTEXT_TYPES,
|
||||
coerce,
|
||||
},
|
||||
|
||||
text: {
|
||||
describe:
|
||||
"Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue