2021-04-10 13:08:22 -07:00
# ! / u s r / b i n / e n v n o d e
2022-11-07 07:57:22 -08:00
import fs from "fs" ;
import path from "path" ;
2023-11-09 11:27:11 -08:00
import http , { IncomingMessage , ServerResponse } from "http" ;
2021-04-10 13:08:22 -07:00
2022-11-07 07:57:22 -08:00
import readline from "readline" ;
import child_process from "child_process" ;
2021-04-10 13:08:22 -07:00
2023-11-09 11:27:11 -08:00
import yargs , { Options } from "yargs" ;
2021-07-20 15:45:51 -07:00
2023-03-17 14:24:44 -07:00
import { logger } from "./util/logger.js" ;
Dev 0.9.0 Beta 1 Work - Playwright Removal + Worker Refactor + Redis State (#253)
* Migrate from Puppeteer to Playwright!
- use playwright persistent browser context to support profiles
- move on-new-page setup actions to worker
- fix screencaster, init only one per page object, associate with worker-id
- fix device emulation: load on startup, also replace '-' with space for more friendly command-line usage
- port additional chromium setup options
- create / detach cdp per page for each new page, screencaster just uses existing cdp
- fix evaluateWithCLI to call CDP command directly
- workers directly during WorkerPool - await not necessary
* State / Worker Refactor (#252)
* refactoring state:
- use RedisCrawlState, defaulting to local redis, remove MemoryCrawlState and BaseState
- remove 'real' accessors / draining queue - no longer neede without puppeteer-cluster
- switch to sorted set for crawl queue, set depth + extraHops as score, (fixes #150)
- override console.error to avoid logging ioredis errors (fixes #244)
- add MAX_DEPTH as const for extraHops
- fix immediate exit on second interrupt
* worker/state refactor:
- remove job object from puppeteer-cluster
- rename shift() -> nextFromQueue()
- condense crawl mgmt logic to crawlPageInWorker: init page, mark pages as finished/failed, close page on failure, etc...
- screencaster: don't screencast about:blank pages
* more worker queue refactor:
- remove p-queue
- initialize PageWorkers which run in its own loop to process pages, until no pending pages, no queued pages
- add setupPage(), teardownPage() to crawler, called from worker
- await runWorkers() promise which runs all workers until completion
- remove: p-queue, node-fetch, update README (no longer using any puppeteer-cluster base code)
- bump to 0.9.0-beta.1
* use existing data object for per-page context, instead of adding things to page (will be more clear with typescript transition)
* more fixes for playwright:
- fix profile creation
- browser: add newWindowPageWithCDP() to create new page + cdp in new window, use with timeout
- crawler: various fixes, including for html check
- logging: addition logging for screencaster, new window, etc...
- remove unused packages
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-03-17 12:50:32 -07:00
import { Browser } from "./util/browser.js" ;
2022-11-07 07:57:22 -08:00
import { initStorage } from "./util/storage.js" ;
2023-11-09 11:27:11 -08:00
import { CDPSession , Page , PuppeteerLifeCycleEvent } from "puppeteer-core" ;
2024-03-18 14:24:48 -07:00
import { getInfoString } from "./util/file_reader.js" ;
2024-06-25 13:53:43 -07:00
import { DISPLAY } from "./util/constants.js" ;
2021-07-20 15:45:51 -07:00
2023-11-09 19:11:11 -05:00
/**
 * Synchronously reads a text asset, resolved relative to this module's URL,
 * and returns its contents decoded as UTF-8.
 */
const readTextAsset = (relativePath: string): string =>
  fs.readFileSync(new URL(relativePath, import.meta.url), {
    encoding: "utf8",
  });

// Static assets, each loaded once when the module is evaluated.

// HTML source read from ../html/createProfile.html
const profileHTML = readTextAsset("../html/createProfile.html");

// HTML source read from ../html/vnc_lite.html
const vncHTML = readTextAsset("../html/vnc_lite.html");

// browsertrix-behaviors bundle source; main() injects it as a page init script
const behaviors = readTextAsset(
  "../node_modules/browsertrix-behaviors/dist/behaviors.js",
);
/**
 * Builds the yargs option definitions for the profile-creation command.
 *
 * Free-text and path options (`user`, `password`, `filename`,
 * `debugScreenshot`) are explicitly declared as strings so that yargs does
 * not coerce numeric-looking values (e.g. an all-digit password) to numbers.
 *
 * @returns a map of option name to yargs option definition
 */
function cliOpts(): { [key: string]: Options } {
  return {
    url: {
      describe: "The URL of the login page",
      type: "string",
      demandOption: true,
    },

    user: {
      describe:
        "The username for the login. If not specified, will be prompted",
      type: "string",
    },

    password: {
      describe:
        "The password for the login. If not specified, will be prompted (recommended)",
      type: "string",
    },

    filename: {
      describe:
        "The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided",
      type: "string",
      default: "/crawls/profiles/profile.tar.gz",
    },

    debugScreenshot: {
      describe:
        "If specified, take a screenshot after login and save as this filename",
      type: "string",
    },

    headless: {
      describe: "Run in headless mode, otherwise start xvfb",
      type: "boolean",
      default: false,
    },

    automated: {
      describe: "Start in automated mode, no interactive browser",
      type: "boolean",
      default: false,
    },

    // Kept only so existing invocations keep parsing; main() logs a
    // deprecation warning when it is set.
    interactive: {
      describe: "Deprecated. Now the default option!",
      type: "boolean",
      default: false,
    },

    shutdownWait: {
      describe:
        "Shutdown browser in interactive after this many seconds, if no pings received",
      type: "number",
      default: 0,
    },

    profile: {
      describe:
        "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory",
      type: "string",
    },

    windowSize: {
      type: "string",
      describe: "Browser window dimensions, specified as: width,height",
      // Derived from the GEOMETRY environment variable.
      default: getDefaultWindowSize(),
    },

    proxyServer: {
      describe:
        "if set, will use specified proxy server. Takes precedence over any env var proxy settings",
      type: "string",
    },

    cookieDays: {
      type: "number",
      describe:
        "If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
      default: 7,
    },
  };
}
2023-01-09 23:56:53 -08:00
/**
 * Computes the default value for the `--windowSize` option from the
 * `GEOMETRY` environment variable.
 *
 * `GEOMETRY` is split on "x" and its first two fields are read as width and
 * height; any further fields are ignored.
 *
 * @returns the window size formatted as "width,height". If `GEOMETRY` is
 *   unset, or its width/height are not positive finite numbers, "1600,900"
 *   is returned instead of a string containing `NaN` or `0`.
 */
function getDefaultWindowSize(): string {
  const fallbackSize = "1600,900";

  const fields = (process.env.GEOMETRY || "").split("x");
  const width = Number(fields[0]);
  const height = Number(fields[1]);

  // Number("") is 0 and Number(undefined) is NaN, so an unset or partial
  // GEOMETRY is caught by these checks.
  const isUsable = (value: number): boolean =>
    Number.isFinite(value) && value > 0;

  if (!isUsable(width) || !isUsable(height)) {
    return fallbackSize;
  }

  return `${width},${height}`;
}
2024-03-18 14:24:48 -07:00
/**
 * Signal handler: logs which signal was received and terminates the process
 * with exit code 1.
 *
 * @param signame name of the received signal, used only in the log message
 */
function handleTerminate(signame: string) {
  logger.info(`Got signal ${signame}, exiting`);
  process.exit(1);
}
2021-04-10 13:08:22 -07:00
/**
 * Entry point for the profile-creation command.
 *
 * Steps, in order:
 *  1. Parse command-line options (see cliOpts()).
 *  2. Enable debug logging, log the info string and install SIGINT/SIGTERM
 *     handlers.
 *  3. Unless `--headless` is set, spawn Xvfb and an x11vnc server on the
 *     configured DISPLAY.
 *  4. Launch the browser, optionally from an existing profile.
 *  5. Run either the interactive flow (default) or the automated login flow.
 *     Supplying `--user` or `--password` implies `--automated`; in automated
 *     mode any missing credential is prompted for.
 */
async function main() {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const params: any = yargs(process.argv)
    .usage("browsertrix-crawler profile [options]")
    .option(cliOpts()).argv;

  logger.setDebugLogging(true);

  logger.info(await getInfoString());

  process.on("SIGINT", () => handleTerminate("SIGINT"));
  process.on("SIGTERM", () => handleTerminate("SIGTERM"));

  if (!params.headless) {
    logger.debug("Launching XVFB");

    // Virtual X display, sized from the GEOMETRY environment variable.
    child_process.spawn("Xvfb", [
      DISPLAY,
      "-listen",
      "tcp",
      "-screen",
      "0",
      process.env.GEOMETRY || "",
      "-ac",
      "+extension",
      "RANDR",
    ]);

    //await fsp.mkdir(path.join(homedir(), ".vnc"), {recursive: true});

    //child_process.spawnSync("x11vnc", ["-storepasswd", process.env.VNC_PASS, path.join(homedir(), ".vnc", "passwd")]);

    // VNC server attached to the same display, listening on port 6080 with
    // the password taken from the VNC_PASS environment variable.
    child_process.spawn("x11vnc", [
      "-forever",
      "-ncache_cr",
      "-xdamage",
      "-usepw",
      "-shared",
      "-rfbport",
      "6080",
      "-passwd",
      process.env.VNC_PASS || "",
      "-display",
      DISPLAY,
    ]);
  }

  const browser = new Browser();
  await browser.launch({
    profileUrl: params.profile,
    headless: params.headless,
    // Signals are handled by the SIGINT/SIGTERM handlers installed above.
    signals: false,
    chromeOptions: {
      proxy: params.proxyServer,
      extraArgs: [
        "--window-position=0,0",
        `--window-size=${params.windowSize}`,
        // to disable the 'stability will suffer' infobar
        "--test-type",
      ],
    },
    recording: false,
  });

  if (params.interactive) {
    logger.warn(
      "Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode",
    );
  }

  // Passing either credential on the command line implies automated mode.
  if (params.user || params.password) {
    params.automated = true;
  }

  if (!params.user && params.automated) {
    params.user = await promptInput("Enter username: ");
  }

  if (!params.password && params.automated) {
    params.password = await promptInput("Enter password: ", true);
  }

  const { page, cdp } = await browser.newWindowPageWithCDP();

  const waitUntil: PuppeteerLifeCycleEvent = "load";

  await page.setCacheEnabled(false);

  if (params.automated) {
    await automatedProfile(params, browser, page, cdp, waitUntil);
    return;
  }

  // Interactive mode (default).
  await browser.setupPage({ page, cdp });

  // for testing, inject browsertrix-behaviors
  await browser.addInitScript(
    page,
    behaviors + ";\nself.__bx_behaviors.init();",
  );

  const target = await cdp.send("Target.getTargetInfo");
  const targetId = target.targetInfo.targetId;

  const ibrowser = new InteractiveBrowser(
    params,
    browser,
    page,
    cdp,
    targetId,
  );
  await ibrowser.startLoad(waitUntil);
}
2021-07-20 15:45:51 -07:00
2023-11-09 19:11:11 -05:00
/**
 * Creates a browser profile non-interactively by filling in a login form.
 *
 * Loads `params.url`, waits for a username/email field and a password field,
 * types `params.user` / `params.password`, submits with Enter, and then hands
 * off to `createProfile()`. The process is terminated in both outcomes:
 * exit code 1 if the login fields cannot be found, exit code 0 once the
 * profile has been written.
 *
 * @param params - parsed options; reads `url`, `user`, `password` and the
 *   optional `debugScreenshot` path.
 * @param browser - browser instance, passed through to `createProfile()`.
 * @param page - page on which the login is performed.
 * @param cdp - CDP session, passed through to `createProfile()`.
 * @param waitUntil - lifecycle event used for the initial load and for the
 *   post-submit navigation.
 */
async function automatedProfile(
  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  params: any,
  browser: Browser,
  page: Page,
  cdp: CDPSession,
  waitUntil: PuppeteerLifeCycleEvent,
) {
  let u, p;

  logger.info(`Loading page: ${params.url}`);

  // A failed or interrupted load is only logged: the login form may still be
  // present on whatever did load, so the field lookup below is the real gate.
  try {
    await page.goto(params.url, { waitUntil });
  } catch (e) {
    logger.error("Page Load Failed/Interrupted", e);
  }

  logger.debug("Looking for username and password entry fields on page...");

  try {
    u = await page.waitForSelector(
      "input[name='user'],input[name='username'],input[name='email']",
    );

    // Selector *list*: any one of the three alternatives may match. The
    // alternatives must be comma-separated; a "." here would instead fold the
    // first two into a single compound selector requiring class "input".
    p = await page.waitForSelector(
      "input[type='password'],input[name='pass'],input[name='password']",
    );
  } catch (e) {
    if (params.debugScreenshot) {
      await page.screenshot({ path: params.debugScreenshot });
    }
    logger.debug("Login form could not be found");
    await page.close();
    process.exit(1);
    return;
  }

  await u!.type(params.user);

  await p!.type(params.password);

  // allSettled: submitting may or may not trigger a navigation, and neither
  // outcome should throw here.
  await Promise.allSettled([
    p!.press("Enter"),
    page.waitForNavigation({ waitUntil }),
  ]);

  if (params.debugScreenshot) {
    await page.screenshot({ path: params.debugScreenshot });
  }

  await createProfile(params, browser, page, cdp);

  process.exit(0);
}
2023-11-09 19:11:11 -05:00
/**
 * Writes the current browser state out as a profile archive.
 *
 * Clears the browser cache over CDP, closes the browser, saves the profile to
 * disk and, when `initStorage()` returns a storage object, uploads the archive.
 *
 * A relative `params.filename` is resolved against `/crawls/profiles/` and
 * written back onto `params`; with no filename at all the archive goes to
 * `/crawls/profiles/profile.tar.gz`. The output directory is created if needed.
 *
 * @param params - parsed options; reads (and may rewrite) `filename`.
 * @param browser - browser to close and save the profile from.
 * @param page - not used by this function.
 * @param cdp - CDP session used to clear the browser cache.
 * @param targetFilename - name passed to the storage upload.
 * @returns the result of the upload, or `{}` when no storage is configured.
 */
async function createProfile(
  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  params: any,
  browser: Browser,
  page: Page,
  cdp: CDPSession,
  targetFilename = "",
) {
  await cdp.send("Network.clearBrowserCache");
  await browser.close();

  logger.info("Creating profile");

  const wantsRelativePath =
    params.filename && !params.filename.startsWith("/");
  if (wantsRelativePath) {
    params.filename = path.resolve("/crawls/profiles/", params.filename);
    logger.info(
      `Absolute path for filename not provided, saving to ${params.filename}`,
    );
  }

  const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz";

  // make sure the destination directory exists before writing the archive
  const outputDir = path.dirname(profileFilename);
  const needsDir = outputDir && !fs.existsSync(outputDir);
  if (needsDir) {
    fs.mkdirSync(outputDir, { recursive: true });
  }

  browser.saveProfile(profileFilename);

  let uploadResult = {};

  const remote = initStorage();
  if (remote) {
    logger.info("Uploading to remote storage...");
    uploadResult = await remote.uploadFile(profileFilename, targetFilename);
  }

  logger.info("Profile creation done");
  return uploadResult;
}
2023-11-09 11:27:11 -08:00
/**
 * Asks a question on the terminal and resolves with the typed answer.
 *
 * @param msg - prompt text shown to the user.
 * @param hidden - when true, every typed character is echoed as `*`
 *   (intended for passwords).
 * @returns a promise resolving to the line the user entered.
 */
function promptInput(msg: string, hidden = false) {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const rl: any = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  if (hidden) {
    // masking approach from https://stackoverflow.com/a/59727173
    const maskTypedInput = () => {
      // how many characters have been typed so far
      const typed = rl.line.length;
      // jump back to the start of the input and wipe what was echoed
      readline.moveCursor(rl.output, -typed, 0);
      readline.clearLine(rl.output, 1);
      // re-draw the input as one asterisk per character
      if (typed > 0) {
        rl.output.write("*".repeat(typed));
      }
    };

    rl.input.on("keypress", maskTypedInput);
  }

  return new Promise<string>((resolve) => {
    rl.question(msg, (answer: string) => {
      rl.close();
      resolve(answer);
    });
  });
}
2022-05-05 14:27:17 -05:00
class InteractiveBrowser {
2023-11-09 11:27:11 -08:00
// Options the session was started with; this class reads `headless`, `url`,
// `shutdownWait` and `cookieDays` from it.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
params : any ;
// Browser wrapper; used here to read and write cookies for `page`.
browser : Browser ;
// The page the user interacts with; all navigations are issued on it.
page : Page ;
// CDP session used for `Page.enable`, `Page.windowOpen` and `Target.activateTarget`.
cdp : CDPSession ;
// CDP target id of `page`: re-activated when a new window is requested, embedded
// in the devtools URL served at "/", and returned by the "/target" endpoint.
targetId : string ;
// Origins of every http(s) URL `page` has been seen on (filled by addOrigin()).
originSet = new Set < string > ( ) ;
// Idle timeout in milliseconds (`params.shutdownWait * 1000`); falsy disables it.
shutdownWait : number ;
// Pending timer that exits the process after `shutdownWait`; re-armed on "/ping",
// null when no idle timeout is configured.
shutdownTimer : NodeJS.Timer | null ;
/**
 * Sets up an interactive profile-creation session around an existing page.
 *
 * Side effects performed here:
 * - in headless mode, spawns `socat` to forward TCP port 9222 to
 *   localhost:9221, and redirects any window the page tries to open back
 *   into the original tab;
 * - records the page's origin now and again on every `load` event;
 * - arms an idle-shutdown timer when `params.shutdownWait` is set;
 * - starts an HTTP server on port 9223 that dispatches to `handleRequest()`.
 *
 * @param params - parsed options; reads `headless` and `shutdownWait` (seconds).
 * @param browser - browser wrapper the page belongs to.
 * @param page - page the user will interact with.
 * @param cdp - CDP session attached to `page`.
 * @param targetId - CDP target id of `page`.
 */
constructor(
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  params: any,
  browser: Browser,
  page: Page,
  cdp: CDPSession,
  targetId: string,
) {
  logger.info("Creating Profile Interactively...");

  if (params.headless) {
    // expose 9222 by forwarding it to 9221 on localhost
    // NOTE(review): presumably 9221 is the browser's debugging port - confirm
    child_process.spawn("socat", [
      "tcp-listen:9222,reuseaddr,fork",
      "tcp:localhost:9221",
    ]);
  }

  this.params = params;
  this.browser = browser;
  this.page = page;
  this.cdp = cdp;

  this.targetId = targetId;

  this.addOrigin();

  page.on("load", () => this.handlePageLoad());

  // attempt to keep everything to initial tab if headless
  if (this.params.headless) {
    // Fire-and-forget, but never leave the rejection unhandled: a failed
    // CDP call here must not surface as an unhandled promise rejection.
    cdp
      .send("Page.enable")
      .catch((e) => logger.warn("Failed to enable CDP Page domain", e));

    cdp.on("Page.windowOpen", async (resp) => {
      if (!resp.url) {
        return;
      }

      try {
        // bring the original tab back to front and load the URL there instead
        await cdp.send("Target.activateTarget", { targetId: this.targetId });
        await page.goto(resp.url);
      } catch (e) {
        logger.error("Page Load Failed/Interrupted", e);
      }
    });
  }

  // seconds -> milliseconds; NaN/0 (option absent) leaves the timer disabled
  this.shutdownWait = params.shutdownWait * 1000;

  if (this.shutdownWait) {
    this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
    logger.debug(
      `Shutting down in ${this.shutdownWait}ms if no ping received`,
    );
  } else {
    this.shutdownTimer = null;
  }

  const httpServer = http.createServer((req, res) =>
    this.handleRequest(req, res),
  );
  const port = 9223;
  httpServer.listen(port);
  logger.info(
    `Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`,
  );

  if (!params.headless) {
    logger.info("Screencasting with VNC on port 6080");
  } else {
    logger.info("Screencasting with CDP on port 9222");
  }
}
2024-03-28 10:21:31 -07:00
2024-05-02 17:55:22 +02:00
/**
 * Navigates the session page to `params.url` with no navigation timeout.
 * A failed or interrupted load is logged as a warning, never thrown.
 *
 * @param waitUntil - lifecycle event that marks the load as finished.
 */
async startLoad(waitUntil: PuppeteerLifeCycleEvent = "load") {
  const { url } = this.params;
  logger.info(`Loading page: ${url}`);

  try {
    // timeout: 0 disables the navigation timeout
    await this.page.goto(url, { waitUntil, timeout: 0 });
    logger.info("Loaded!");
  } catch (err) {
    logger.warn("Page Load Failed/Interrupted", err);
  }
}
2022-05-18 23:23:32 -07:00
/**
 * `load` event handler: records the page's current origin and kicks off a
 * cookie save for the current URL without waiting for it to finish.
 */
handlePageLoad() {
  this.addOrigin();

  const currentUrl = this.page.url();
  // not awaited on purpose; saveCookiesFor() logs its own failures
  void this.saveCookiesFor(currentUrl);
}
/**
 * Runs `saveCookiesFor()` once per recorded origin, sequentially, using the
 * origin's root URL (`<origin>/`).
 */
async saveAllCookies() {
  logger.info("Saving all cookies");

  for (const seenOrigin of this.originSet) {
    await this.saveCookiesFor(`${seenOrigin}/`);
  }
}
2023-11-09 11:27:11 -08:00
/**
 * Re-writes the page's cookies so that they expire `params.cookieDays` days
 * from now. Does nothing when `cookieDays` is zero or negative.
 *
 * Each cookie loses its `size` and `session` fields, loses `sameSite` unless
 * it is "Lax" or "Strict", and gets `url` attached when it carries neither a
 * domain nor a path. Any error is logged and swallowed.
 *
 * @param url - URL assigned to cookies that have no domain and no path.
 */
async saveCookiesFor(url: string) {
  try {
    const { cookieDays } = this.params;
    if (cookieDays <= 0) {
      return;
    }

    const keptSameSite = ["Lax", "Strict"];
    const cookies = await this.browser.getCookies(this.page);

    for (const original of cookies) {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const cookie = original as any;

      // expiry is expressed in seconds since the epoch
      cookie.expires = Date.now() / 1000 + cookieDays * 86400;

      delete cookie.size;
      delete cookie.session;

      if (cookie.sameSite && !keptSameSite.includes(cookie.sameSite)) {
        delete cookie.sameSite;
      }

      const hasScope = cookie.domain || cookie.path;
      if (!hasScope) {
        cookie.url = url;
      }
    }

    await this.browser.setCookies(this.page, cookies);
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
  } catch (err: any) {
    logger.error("Save Cookie Error: ", err);
  }
}
2022-05-05 14:27:17 -05:00
/**
 * Adds the origin of the page's current URL to `originSet`.
 * URLs that are not http(s) (e.g. about:blank) are ignored.
 */
addOrigin() {
  const url = this.page.url();
  logger.debug("Adding origin", { url });

  const isWebUrl = ["http:", "https:"].some((scheme) =>
    url.startsWith(scheme),
  );
  if (!isWebUrl) {
    return;
  }

  this.originSet.add(new URL(url).origin);
}
2023-11-09 11:27:11 -08:00
async handleRequest ( req : IncomingMessage , res : ServerResponse ) {
const parsedUrl = new URL ( req . url || "" , ` http:// ${ req . headers . host } ` ) ;
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
const pathname = parsedUrl . pathname ;
2022-05-05 14:27:17 -05:00
let targetUrl ;
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
let origins ;
2022-05-05 14:27:17 -05:00
switch ( pathname ) {
2023-11-09 19:11:11 -05:00
case "/" :
res . writeHead ( 200 , { "Content-Type" : "text/html" } ) ;
if ( this . params . headless ) {
targetUrl = ` http:// $ HOST:9222/devtools/inspector.html?ws= $ HOST:9222/devtools/page/ ${ this . targetId } &panel=resources ` ;
} else {
targetUrl = ` http:// $ HOST:9223/vnc/?host= $ HOST&port=6080&password= ${ process . env . VNC_PASS } ` ;
}
res . end (
profileHTML . replace (
"$DEVTOOLS_SRC" ,
targetUrl . replaceAll ( "$HOST" , parsedUrl . hostname ) ,
) ,
2023-11-09 11:27:11 -08:00
) ;
2023-11-09 19:11:11 -05:00
return ;
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
2023-11-09 19:11:11 -05:00
case "/vnc/" :
case "/vnc/index.html" :
res . writeHead ( 200 , { "Content-Type" : "text/html" } ) ;
res . end ( vncHTML ) ;
return ;
2022-05-05 14:27:17 -05:00
2023-11-09 19:11:11 -05:00
case "/ping" :
if ( this . shutdownWait ) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
clearTimeout ( this . shutdownTimer as any ) ;
this . shutdownTimer = setTimeout (
( ) = > process . exit ( 0 ) ,
this . shutdownWait ,
) ;
logger . debug (
` Ping received, delaying shutdown for ${ this . shutdownWait } ms ` ,
) ;
}
2022-05-05 14:27:17 -05:00
2023-11-09 19:11:11 -05:00
origins = Array . from ( this . originSet . values ( ) ) ;
2023-01-09 23:56:53 -08:00
2023-11-09 19:11:11 -05:00
res . writeHead ( 200 , { "Content-Type" : "application/json" } ) ;
2022-05-05 14:27:17 -05:00
2023-11-09 19:11:11 -05:00
res . end ( JSON . stringify ( { pong : true , origins } ) ) ;
return ;
2022-05-05 14:27:17 -05:00
2023-11-09 19:11:11 -05:00
case "/target" :
2023-11-09 11:27:11 -08:00
res . writeHead ( 200 , { "Content-Type" : "application/json" } ) ;
2023-11-09 19:11:11 -05:00
res . end ( JSON . stringify ( { targetId : this.targetId } ) ) ;
return ;
2022-05-05 14:27:17 -05:00
2023-11-09 19:11:11 -05:00
case "/vncpass" :
res . writeHead ( 200 , { "Content-Type" : "application/json" } ) ;
res . end ( JSON . stringify ( { password : process.env.VNC_PASS } ) ) ;
return ;
2022-05-05 14:27:17 -05:00
2023-11-09 19:11:11 -05:00
case "/navigate" :
if ( req . method !== "POST" ) {
break ;
}
2022-05-18 23:23:32 -07:00
2023-11-09 19:11:11 -05:00
try {
const postData = await this . readBodyJson ( req ) ;
const url = new URL ( postData . url ) . href ;
2022-05-18 23:23:32 -07:00
2023-11-09 19:11:11 -05:00
res . writeHead ( 200 , { "Content-Type" : "application/json" } ) ;
res . end ( JSON . stringify ( { success : true } ) ) ;
2022-05-05 14:27:17 -05:00
2024-04-25 09:34:57 +02:00
logger . info ( "Loading Page" , { page : url } ) ;
this . page
. goto ( url )
. catch ( ( e ) = > logger . warn ( "Page Load Failed/Interrupted" , e ) ) ;
2023-11-09 19:11:11 -05:00
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch ( e : any ) {
res . writeHead ( 400 , { "Content-Type" : "application/json" } ) ;
res . end ( JSON . stringify ( { error : e.toString ( ) } ) ) ;
logger . warn ( "HTTP Error" , e ) ;
}
return ;
2021-07-20 15:45:51 -07:00
2023-11-09 19:11:11 -05:00
case "/createProfileJS" :
if ( req . method !== "POST" ) {
break ;
}
2022-05-05 14:27:17 -05:00
2023-11-09 19:11:11 -05:00
try {
const postData = await this . readBodyJson ( req ) ;
const targetFilename = postData . filename || "" ;
await this . saveAllCookies ( ) ;
const resource = await createProfile (
this . params ,
this . browser ,
this . page ,
this . cdp ,
targetFilename ,
) ;
origins = Array . from ( this . originSet . values ( ) ) ;
res . writeHead ( 200 , { "Content-Type" : "application/json" } ) ;
res . end ( JSON . stringify ( { resource , origins } ) ) ;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch ( e : any ) {
res . writeHead ( 500 , { "Content-Type" : "application/json" } ) ;
res . end ( JSON . stringify ( { error : e.toString ( ) } ) ) ;
logger . warn ( "HTTP Error" , e ) ;
}
2021-07-20 15:45:51 -07:00
2023-11-09 19:11:11 -05:00
setTimeout ( ( ) = > process . exit ( 0 ) , 200 ) ;
return ;
2022-05-18 23:23:32 -07:00
2023-11-09 19:11:11 -05:00
case "/createProfile" :
if ( req . method !== "POST" ) {
break ;
}
2021-07-20 15:45:51 -07:00
2023-11-09 19:11:11 -05:00
try {
await this . saveAllCookies ( ) ;
await createProfile ( this . params , this . browser , this . page , this . cdp ) ;
res . writeHead ( 200 , { "Content-Type" : "text/html" } ) ;
res . end (
"<html><body>Profile Created! You may now close this window.</body></html>" ,
) ;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch ( e : any ) {
res . writeHead ( 500 , { "Content-Type" : "text/html" } ) ;
res . end (
"<html><body>Profile creation failed! See the browsertrix-crawler console for more info" ,
) ;
logger . warn ( "HTTP Error" , e ) ;
}
2021-07-20 15:45:51 -07:00
2023-11-09 19:11:11 -05:00
setTimeout ( ( ) = > process . exit ( 0 ) , 200 ) ;
return ;
2021-07-20 15:45:51 -07:00
}
2023-01-09 23:56:53 -08:00
if ( pathname . startsWith ( "/vnc/" ) ) {
2023-11-09 11:27:11 -08:00
const fileUrl = new URL (
"../node_modules/@novnc/novnc/" + pathname . slice ( "/vnc/" . length ) ,
2023-11-09 19:11:11 -05:00
import . meta . url ,
2023-11-09 11:27:11 -08:00
) ;
const file = fs . readFileSync ( fileUrl , { encoding : "utf-8" } ) ;
res . writeHead ( 200 , { "Content-Type" : "application/javascript" } ) ;
2023-01-09 23:56:53 -08:00
res . end ( file ) ;
return ;
}
2023-11-09 11:27:11 -08:00
res . writeHead ( 404 , { "Content-Type" : "text/html" } ) ;
2022-05-05 14:27:17 -05:00
res . end ( "Not Found" ) ;
}
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
2023-11-09 11:27:11 -08:00
/**
 * Read the entire request body and parse it as JSON.
 *
 * The result is always safe to read properties from: an empty body,
 * a body that is not valid JSON, or a falsy JSON value (e.g. `null`)
 * all resolve to an empty object.
 *
 * @param req Incoming HTTP request whose body is consumed to the end.
 * @returns The parsed JSON value, or `{}` when nothing usable was sent.
 */
async readBodyJson(req: IncomingMessage) {
  const chunks: Buffer[] = [];
  for await (const chunk of req) {
    chunks.push(chunk);
  }

  const data = Buffer.concat(chunks).toString();

  // An empty body previously fell through and returned undefined, which made
  // callers such as `postData.filename || ""` throw instead of using defaults.
  if (!data.length) {
    return {};
  }

  try {
    return JSON.parse(data) || {};
  } catch (e) {
    // Malformed JSON is treated the same as "no data supplied".
    return {};
  }
}
2021-07-20 15:45:51 -07:00
}
2021-04-10 13:08:22 -07:00
// Run the program: main() is invoked as soon as this module is loaded.
main ( ) ;