| 
									
										
										
										
											2022-10-24 15:30:10 +02:00
										 |  |  | import fs from "fs"; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import util from "util"; | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  | import { exec as execCallback } from "child_process"; | 
					
						
							| 
									
										
										
										
											2022-10-24 15:30:10 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | const exec = util.promisify(execCallback); | 
					
						
							| 
									
										
										
										
											2022-01-15 09:03:09 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
											  
											
												Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer needed without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: additional logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
											
										 
											2023-03-17 12:50:32 -07:00
										 |  |  | const extraHopsTimeout = 180000; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  | test( | 
					
						
							|  |  |  |   "check that URLs are crawled 2 extra hops beyond depth", | 
					
						
							|  |  |  |   async () => { | 
					
						
							|  |  |  |     try { | 
					
						
							|  |  |  |       await exec( | 
					
						
							|  |  |  |         "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7", | 
					
						
							|  |  |  |       ); | 
					
						
							|  |  |  |     } catch (error) { | 
					
						
							|  |  |  |       console.log(error); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2022-01-15 09:03:09 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  |     const crawledPages = fs.readFileSync( | 
					
						
							|  |  |  |       "test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", | 
					
						
							|  |  |  |       "utf8", | 
					
						
							|  |  |  |     ); | 
					
						
							|  |  |  |     const crawledPagesArray = crawledPages.trim().split("\n"); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-11 13:55:52 -07:00
										 |  |  |     const crawledExtraPages = fs.readFileSync( | 
					
						
							|  |  |  |       "test-crawls/collections/extra-hops-beyond/pages/extraPages.jsonl", | 
					
						
							|  |  |  |       "utf8", | 
					
						
							|  |  |  |     ); | 
					
						
							|  |  |  |     const crawledExtraPagesArray = crawledExtraPages.trim().split("\n"); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  |     const expectedPages = [ | 
					
						
							|  |  |  |       "https://webrecorder.net/", | 
					
						
							| 
									
										
										
										
											2024-04-11 13:55:52 -07:00
										 |  |  |     ]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     const expectedExtraPages = [ | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  |       "https://webrecorder.net/blog", | 
					
						
							|  |  |  |       "https://webrecorder.net/tools", | 
					
						
							|  |  |  |       "https://webrecorder.net/community", | 
					
						
							|  |  |  |       "https://webrecorder.net/about", | 
					
						
							|  |  |  |       "https://webrecorder.net/contact", | 
					
						
							|  |  |  |       "https://webrecorder.net/faq", | 
					
						
							|  |  |  |     ]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     // first line is the header, not page, so adding -1
 | 
					
						
							|  |  |  |     expect(crawledPagesArray.length - 1).toEqual(expectedPages.length); | 
					
						
							| 
									
										
										
										
											2024-04-11 13:55:52 -07:00
										 |  |  |     expect(crawledExtraPagesArray.length - 1).toEqual(expectedExtraPages.length); | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     for (const page of crawledPagesArray) { | 
					
						
							|  |  |  |       const url = JSON.parse(page).url; | 
					
						
							|  |  |  |       if (!url) { | 
					
						
							|  |  |  |         continue; | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |       expect(expectedPages.indexOf(url) >= 0).toBe(true); | 
					
						
							| 
									
										
										
										
											2022-01-15 09:03:09 -08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-04-11 13:55:52 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  |     for (const page of crawledExtraPagesArray) { | 
					
						
							|  |  |  |       const url = JSON.parse(page).url; | 
					
						
							|  |  |  |       if (!url) { | 
					
						
							|  |  |  |         continue; | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |       expect(expectedExtraPages.indexOf(url) >= 0).toBe(true); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2023-11-09 19:11:11 -05:00
										 |  |  |   }, | 
					
						
							|  |  |  |   extraHopsTimeout, | 
					
						
							|  |  |  | ); |