mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Catch loading issues (#255)
* various loading improvements to avoid pages getting 'stuck' + load state tracking - add PageState object, store loadState (0 to 4) as well as other per-page-state properties on defined object. - set loadState to 0 (failed) by default - set loadState to 1 (content-loaded) on 'domcontentloaded' event - if page.goto() finishes, set loadState to 2 'full-page-load'. - if page.goto() times out, if no domcontentloaded either, fail immediately. if domcontentloaded reached, extract links, but don't run behaviors - page considered 'finished' if it got to at least loadState 2 'full-page-load', even if behaviors timed out - pages: log 'loadState' as part of pages.jsonl - improve frame detection: detect if a frame is actually not from a frame tag (e.g. an OBJECT tag), and skip it as well - screencaster: try screencasting every frame for now instead of every other frame, for smoother screencasting - deps: behaviors: bump to browsertrix-behaviors 0.5.0-beta.0 release (includes autoscroll improvements) - worker ids: just use 0, 1, ... n-1 worker indexes, send numeric index as part of screencast messages - worker: only keep track of crash state to recreate page, decouple crash and page failed/succeeded state - screencaster: allow reusing caster slots with fixed ids - interrupt timedCrawlPage() wait if 'crash' event happens - crawler: pageFinished() callback when page finishes - worker: add workerIdle callback, call screencaster.stopById() and send 'close' message when worker is empty
This commit is contained in:
parent
07e503a8e6
commit
02fb137b2c
8 changed files with 190 additions and 105 deletions
|
@ -1079,10 +1079,10 @@ browserslist@^4.21.3:
|
|||
node-releases "^2.0.6"
|
||||
update-browserslist-db "^1.0.9"
|
||||
|
||||
browsertrix-behaviors@^0.4.2:
|
||||
version "0.4.2"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.4.2.tgz#830f4e37ddebf10dd923237dfd75c734d5d211e9"
|
||||
integrity sha512-5w6kPL3NB/BkmEGxWt3NT3iddAaSzMR1TtDPS7b66fM9kkhpCjCv/R/zR951jWDIeV3flJFBOy09uI5o8asPqg==
|
||||
browsertrix-behaviors@^0.5.0-beta.0:
|
||||
version "0.5.0-beta.0"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.0-beta.0.tgz#d1a7c35cda31d740a374df1e833f36bd1890768d"
|
||||
integrity sha512-RQMQlbV4OBAzYyhTI7imoem8p4MTj2XSDzlIZvA5sC5U89OMnJ0VM5KBAJzET3PUJkQlUQEOTiXtnsnodHXTUQ==
|
||||
|
||||
bser@2.1.1:
|
||||
version "2.1.1"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue