#!/usr/bin/env node
// Node.js built-in modules.
const readline = require("readline");
const child_process = require("child_process");
const fs = require("fs");
const path = require("path");
const http = require("http");

// Third-party dependencies.
const puppeteer = require("puppeteer-core");
const yargs = require("yargs");

// Project-local browser helpers.
const { getBrowserExe, loadProfile, saveProfile, chromeArgs, sleep } = require("./util/browser");

// HTML template for the interactive profile UI; its "$DEVTOOLS_SRC"
// placeholder is substituted per request in handleInteractive().
const profileHTML = fs.readFileSync(path.join(__dirname, "html", "createProfile.html"), { encoding: "utf8" });

// browsertrix-behaviors bundle, injected into pages in interactive mode.
const behaviors = fs.readFileSync(path.join(__dirname, "node_modules", "browsertrix-behaviors", "dist", "behaviors.js"), { encoding: "utf8" });

/**
 * Build the yargs option definitions for the profile-creation command.
 *
 * String-valued options are explicitly typed as "string" so that yargs does
 * not coerce numeric-looking values (e.g. an all-digit password) to numbers.
 *
 * @returns {Object} A fresh map of option name -> yargs option spec.
 */
function cliOpts() {
  return {
    "url": {
      describe: "The URL of the login page",
      type: "string",
      demandOption: true,
    },

    "user": {
      describe: "The username for the login. If not specified, will be prompted",
      type: "string",
    },

    "password": {
      describe: "The password for the login. If not specified, will be prompted (recommended)",
      type: "string",
    },

    "filename": {
      describe: "The filename for the profile tarball",
      type: "string",
      default: "/output/profile.tar.gz",
    },

    "debugScreenshot": {
      describe: "If specified, take a screenshot after login and save as this filename",
      type: "string",
    },

    "headless": {
      describe: "Run in headless mode, otherwise start xvfb",
      type: "boolean",
      default: false,
    },

    "interactive": {
      describe: "Start in interactive mode!",
      type: "boolean",
      default: false,
    },

    "profile": {
      describe: "Path to tar.gz file which will be extracted and used as the browser profile",
      type: "string",
    },

    "windowSize": {
      type: "string",
      describe: "Browser window dimensions, specified as: width,height",
      default: "1600,900",
    },

    "proxy": {
      describe: "Launch a local pywb (wayback) live proxy before starting the browser",
      type: "boolean",
      default: false,
    },
  };
}
/**
 * Entry point of the profile-creation CLI.
 *
 * Parses the command line, optionally starts Xvfb and a pywb proxy, launches
 * the browser on the (optionally pre-loaded) profile directory and loads the
 * requested URL. In interactive mode control is handed to
 * handleInteractive(); otherwise the login form is located and submitted with
 * the supplied or prompted credentials, after which the profile is saved and
 * the process exits.
 *
 * Exits with status 1 if the login form cannot be found.
 */
async function main() {
  const params = yargs
    .usage("browsertrix-crawler profile [options]")
    .option(cliOpts())
    .argv;

  if (!params.headless) {
    console.log("Launching XVFB");
    child_process.spawn("Xvfb", [
      process.env.DISPLAY,
      "-listen",
      "tcp",
      "-screen",
      "0",
      process.env.GEOMETRY,
      "-ac",
      "+extension",
      "RANDR"
    ]);
  }

  let useProxy = false;

  if (params.proxy) {
    child_process.spawn("wayback", ["--live", "--proxy", "live"], { stdio: "inherit", cwd: "/tmp" });

    console.log("Running with pywb proxy");

    // Fixed delay before launching the browser -- presumably to let the
    // proxy finish starting up.
    await sleep(3000);

    useProxy = true;
  }

  // The remote debugging port is the one handleInteractive() forwards to.
  const browserArgs = chromeArgs(useProxy, null, [
    "--remote-debugging-port=9221",
    `--window-size=${params.windowSize}`,
  ]);

  const profileDir = await loadProfile(params.profile);

  const args = {
    headless: !!params.headless,
    executablePath: getBrowserExe(),
    ignoreHTTPSErrors: true,
    args: browserArgs,
    userDataDir: profileDir,
    defaultViewport: null,
  };

  // Credentials are only needed for the automated (non-interactive) login.
  if (!params.user && !params.interactive) {
    params.user = await promptInput("Enter username: ");
  }

  if (!params.password && !params.interactive) {
    params.password = await promptInput("Enter password: ", true);
  }

  const browser = await puppeteer.launch(args);

  const page = await browser.newPage();

  const waitUntil = ["load", "networkidle2"];

  await page.setCacheEnabled(false);

  if (params.interactive) {
    await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
    // for testing, inject browsertrix-behaviors
    await page.evaluateOnNewDocument(behaviors + ";\nself.__bx_behaviors.init();");
  }

  console.log("loading");

  await page.goto(params.url, { waitUntil });

  console.log("loaded");

  if (params.interactive) {
    await handleInteractive(params, browser, page);
    return;
  }

  let u, p;

  try {
    u = await page.waitForXPath("//input[contains(@name, 'user') or contains(@name, 'email')]");

    p = await page.waitForXPath("//input[contains(@name, 'pass') and @type='password']");

  } catch (e) {
    if (params.debugScreenshot) {
      await page.screenshot({ path: params.debugScreenshot });
    }
    console.log("Login form could not be found");
    await page.close();
    process.exit(1);
  }

  // Coerce to string: a value parsed as a number by the CLI is not iterable
  // character-by-character and would make type() throw.
  await u.type(String(params.user));

  await p.type(String(params.password));

  // Best effort: submit the form and wait for the resulting navigation, but
  // carry on even if either step rejects (e.g. no navigation happens).
  await Promise.allSettled([
    p.press("Enter"),
    page.waitForNavigation({ waitUntil })
  ]);

  if (params.debugScreenshot) {
    await page.screenshot({ path: params.debugScreenshot });
  }

  await createProfile(params, browser, page);

  process.exit(0);
}

/**
 * Clear the browser cache, shut the browser down and save the profile.
 *
 * @param {Object} params - Parsed CLI options; `params.filename` is the
 *   output path, falling back to "/output/profile.tar.gz" when falsy.
 * @param {Object} browser - Puppeteer browser; closed before saving.
 * @param {Object} page - Puppeteer page whose CDP client is used to clear
 *   the browser cache.
 * @returns {Promise<void>} Resolves once the profile has been saved.
 */
async function createProfile(params, browser, page) {
  await page._client.send("Network.clearBrowserCache");

  // Close the browser before the profile is saved.
  await browser.close();

  console.log("creating profile");

  const profileFilename = params.filename || "/output/profile.tar.gz";

  // Awaiting is a no-op if saveProfile() is synchronous, and guarantees the
  // profile is written before "done" is logged (and before callers exit the
  // process) if it returns a promise.
  await saveProfile(profileFilename);

  console.log("done");
}

/**
 * Prompt on stdout and read one line of input from stdin.
 *
 * @param {string} msg - Prompt text shown to the user.
 * @param {boolean} [hidden=false] - When true, typed characters are
 *   overwritten with asterisks on each keypress.
 * @returns {Promise<string>} Resolves with the entered line.
 */
function promptInput(msg, hidden = false) {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });

  if (hidden) {
    // from https://stackoverflow.com/a/59727173
    const maskInput = () => {
      // get the number of characters entered so far:
      const len = rl.line.length;
      // move cursor back to the beginning of the input:
      readline.moveCursor(rl.output, -len, 0);
      // clear everything to the right of the cursor:
      readline.clearLine(rl.output, 1);
      // replace the original input with asterisks:
      if (len > 0) {
        rl.output.write("*".repeat(len));
      }
    };

    rl.input.on("keypress", maskInput);

    // stdin outlives this interface: detach the masking listener on close so
    // it cannot interfere with any later prompt.
    rl.once("close", () => rl.input.removeListener("keypress", maskInput));
  }

  return new Promise((resolve) => {
    rl.question(msg, (res) => {
      rl.close();
      resolve(res);
    });
  });
}

/**
 * Serve a small web UI through which the user drives the browser and
 * triggers profile creation.
 *
 * Starts `socat` to forward port 9222 to the browser's remote debugging port
 * on localhost:9221, then listens on port 9223:
 *   - GET /               serves the profile page with "$DEVTOOLS_SRC"
 *                         replaced by the DevTools inspector URL for `page`
 *   - POST /createProfile saves the profile and then exits the process
 *                         (status 0 on success, 1 on failure)
 *   - anything else       404
 *
 * @param {Object} params - Parsed CLI options, forwarded to createProfile().
 * @param {Object} browser - Puppeteer browser instance.
 * @param {Object} page - Puppeteer page exposed through DevTools.
 */
async function handleInteractive(params, browser, page) {
  const target = page.target();

  // "$HOST" is substituted per request with the hostname the client used, so
  // the embedded DevTools URL points back at the same host.
  const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${target._targetId}&panel=resources`;

  console.log("Creating Profile Interactively...");
  child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);

  const httpServer = http.createServer(async (req, res) => {
    const parsedUrl = new URL(req.url, `http://${req.headers.host}`);
    const pathname = parsedUrl.pathname;

    if (pathname === "/") {
      res.writeHead(200, { "Content-Type": "text/html" });
      res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replaceAll("$HOST", parsedUrl.hostname)));

    } else if (pathname === "/createProfile" && req.method === "POST") {
      let exitCode = 0;

      try {
        await createProfile(params, browser, page);

        res.writeHead(200, { "Content-Type": "text/html" });
        res.end("<html><body>Profile Created! You may now close this window.</body></html>");
      } catch (e) {
        // Report the failure through the exit status as well as the page.
        exitCode = 1;

        res.writeHead(500, { "Content-Type": "text/html" });
        res.end("<html><body>Profile creation failed! See the browsertrix-crawler console for more info</body></html>");
        console.log(e);
      }

      // Short delay before exiting so the response can be sent first.
      setTimeout(() => process.exit(exitCode), 200);

    } else {
      res.writeHead(404, { "Content-Type": "text/html" });
      res.end("Not Found");
    }
  });

  const port = 9223;
  httpServer.listen(port);
  console.log(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);
}

// Run the CLI; report any unhandled failure and exit with a non-zero status
// instead of leaving the returned promise floating.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});