2021-04-10 13:08:22 -07:00
#!/usr/bin/env node
2022-11-07 07:57:22 -08:00
import fs from "fs";
import path from "path";
import http from "http";
import readline from "readline";
import child_process from "child_process";

import puppeteer from "puppeteer-core";
import yargs from "yargs";

import { getBrowserExe, loadProfile, saveProfile, chromeArgs, sleep } from "./util/browser.js";
import { initStorage } from "./util/storage.js";
2021-07-20 15:45:51 -07:00
2022-11-07 07:57:22 -08:00
// Static text assets, read synchronously once at module load and resolved
// relative to this module's own URL rather than the working directory.

// HTML page served by the interactive profile UI.
const profileHTML = fs.readFileSync(
  new URL("html/createProfile.html", import.meta.url),
  "utf8",
);

// Bundled browsertrix-behaviors script injected into pages in interactive mode.
const behaviors = fs.readFileSync(
  new URL("./node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url),
  "utf8",
);
2022-02-20 22:22:19 -08:00
2021-04-10 13:08:22 -07:00
/**
 * Build the yargs option definitions for the profile-creation command.
 *
 * A fresh object is returned on every call, so callers may mutate the
 * result without affecting later calls.
 *
 * @returns {Object<string, object>} map of option name to its yargs option spec
 */
function cliOpts() {
  return {
    "url": {
      describe: "The URL of the login page",
      type: "string",
      demandOption: true,
    },

    "user": {
      describe: "The username for the login. If not specified, will be prompted",
    },

    "password": {
      describe: "The password for the login. If not specified, will be prompted (recommended)",
    },

    "filename": {
      describe: "The filename for the profile tarball",
      default: "/crawls/profiles/profile.tar.gz",
    },

    "debugScreenshot": {
      describe: "If specified, take a screenshot after login and save as this filename"
    },

    "headless": {
      describe: "Run in headless mode, otherwise start xvfb",
      type: "boolean",
      default: false,
    },

    "interactive": {
      describe: "Start in interactive mode!",
      type: "boolean",
      default: false,
    },

    "shutdownWait": {
      describe: "Shutdown browser in interactive after this many seconds, if no pings received",
      type: "number",
      default: 0
    },

    "profile": {
      describe: "Path to tar.gz file which will be extracted and used as the browser profile",
      type: "string",
    },

    "windowSize": {
      type: "string",
      describe: "Browser window dimensions, specified as: width,height",
      default: "1600,900"
    },

    "proxy": {
      type: "boolean",
      default: false
    },

    "cookieDays": {
      type: "number",
      describe: "If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
      default: 7
    }
  };
}
/**
 * Entry point of the profile-creation command.
 *
 * Parses the command line, optionally starts Xvfb (when not headless) and a
 * `wayback` proxy (when --proxy is set), launches the browser on the loaded
 * profile directory and opens the login URL. In interactive mode control is
 * handed to InteractiveBrowser and this function returns; otherwise the login
 * form is located and filled in, and the resulting profile is saved before
 * the process exits.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const params = yargs(process.argv)
    .usage("browsertrix-crawler profile [options]")
    .option(cliOpts())
    .argv;

  if (!params.headless) {
    console.log("Launching XVFB");
    // Display name and screen geometry are taken from the environment.
    child_process.spawn("Xvfb", [
      process.env.DISPLAY,
      "-listen",
      "tcp",
      "-screen",
      "0",
      process.env.GEOMETRY,
      "-ac",
      "+extension",
      "RANDR"
    ]);
  }

  let useProxy = false;

  if (params.proxy) {
    child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"});
    console.log("Running with pywb proxy");
    // Fixed delay after spawning the proxy before the browser is launched.
    await sleep(3000);
    useProxy = true;
  }

  // The flag must contain no embedded whitespace to form a single valid switch.
  const browserArgs = chromeArgs(useProxy, null, [
    `--window-size=${params.windowSize}`,
  ]);

  const profileDir = await loadProfile(params.profile);

  const args = {
    headless: !!params.headless,
    executablePath: getBrowserExe(),
    ignoreHTTPSErrors: true,
    args: browserArgs,
    userDataDir: profileDir,
    defaultViewport: null,
  };

  // Credentials are only needed for the automated login flow.
  if (!params.user && !params.interactive) {
    params.user = await promptInput("Enter username: ");
  }

  if (!params.password && !params.interactive) {
    params.password = await promptInput("Enter password: ", true);
  }

  const browser = await puppeteer.launch(args);

  const page = await browser.newPage();

  const waitUntil = ["load", "networkidle2"];

  await page.setCacheEnabled(false);

  if (params.interactive) {
    await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
    // for testing, inject browsertrix-behaviors
    await page.evaluateOnNewDocument(behaviors + ";\nself.__bx_behaviors.init();");
  }

  console.log("loading");

  await page.goto(params.url, {waitUntil});

  console.log("loaded");

  if (params.interactive) {
    // Constructed for its side effects; the instance is not retained here.
    new InteractiveBrowser(params, browser, page);
    return;
  }

  let u, p;

  try {
    u = await page.waitForXPath("//input[contains(@name, 'user') or contains(@name, 'email')]");

    p = await page.waitForXPath("//input[contains(@name, 'pass') and @type='password']");

  } catch (e) {
    if (params.debugScreenshot) {
      await page.screenshot({path: params.debugScreenshot});
    }
    console.log("Login form could not be found");
    await page.close();
    process.exit(1);
    return;
  }

  await u.type(params.user);

  await p.type(params.password);

  // allSettled: a failure of either the key press or the navigation wait
  // does not reject here, so profile creation proceeds regardless.
  await Promise.allSettled([
    p.press("Enter"),
    page.waitForNavigation({waitUntil})
  ]);

  if (params.debugScreenshot) {
    await page.screenshot({path: params.debugScreenshot});
  }

  await createProfile(params, browser, page);

  process.exit(0);
}
2022-05-05 14:27:17 -05:00
/**
 * Close the browser and save its profile as a tarball, uploading it to
 * remote storage when initStorage() returns a storage object.
 *
 * @param {object} params - parsed CLI options; `params.filename` is the output path
 * @param {object} browser - browser instance; it is closed before saving
 * @param {object} page - page whose CDP client is used to clear the browser cache
 * @param {string} [targetFilename=""] - name passed to storage.uploadFile()
 * @returns {Promise<object>} result of storage.uploadFile(), or `{}` when no storage is configured
 */
async function createProfile(params, browser, page, targetFilename = "") {
  await page._client().send("Network.clearBrowserCache");

  await browser.close();

  console.log("creating profile");

  const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz";

  const outputDir = path.dirname(profileFilename);
  if (outputDir && !fs.existsSync(outputDir)) {
    fs.mkdirSync(outputDir, {recursive: true});
  }

  // Awaited so the archive is complete before any upload starts; this is a
  // no-op if saveProfile is synchronous.
  // NOTE(review): confirm against util/browser.js.
  await saveProfile(profileFilename);

  let resource = {};

  const storage = initStorage();
  if (storage) {
    console.log("Uploading to remote storage...");
    resource = await storage.uploadFile(profileFilename, targetFilename);
  }

  console.log("done");
  return resource;
}
/**
 * Prompt on the terminal and resolve with the line the user enters.
 *
 * @param {string} msg - prompt text written before reading input
 * @param {boolean} [hidden=false] - when true, the typed text is overwritten
 *   with asterisks after every key press (e.g. for passwords)
 * @returns {Promise<string>} the entered line
 */
function promptInput(msg, hidden = false) {
  const input = process.stdin;
  const output = process.stdout;

  const rl = readline.createInterface({ input, output });

  if (hidden) {
    // from https://stackoverflow.com/a/59727173
    const maskTypedText = () => {
      // get the number of characters entered so far:
      const len = rl.line.length;
      // move cursor back to the beginning of the input:
      readline.moveCursor(output, -len, 0);
      // clear everything to the right of the cursor:
      readline.clearLine(output, 1);
      // replace the original input with asterisks:
      output.write("*".repeat(len));
    };

    input.on("keypress", maskTypedText);

    // stdin outlives this interface: detach the masking handler once the
    // prompt is closed so it cannot interfere with later reads.
    rl.once("close", () => input.off("keypress", maskTypedText));
  }

  return new Promise((resolve) => {
    rl.question(msg, (res) => {
      rl.close();
      resolve(res);
    });
  });
}
2021-07-20 15:45:51 -07:00
2022-05-05 14:27:17 -05:00
class InteractiveBrowser {
constructor ( params , browser , page ) {
console . log ( "Creating Profile Interactively..." ) ;
child _process . spawn ( "socat" , [ "tcp-listen:9222,fork" , "tcp:localhost:9221" ] ) ;
this . params = params ;
this . browser = browser ;
this . page = page ;
const target = page . target ( ) ;
this . targetId = target . _targetId ;
this . originSet = new Set ( ) ;
this . addOrigin ( ) ;
2021-07-20 15:45:51 -07:00
2022-05-18 23:23:32 -07:00
page . on ( "load" , ( ) => this . handlePageLoad ( ) ) ;
2022-05-05 14:27:17 -05:00
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
page . on ( "popup" , async ( ) => {
2022-08-17 21:40:10 -07:00
await this . page . _client ( ) . send ( "Target.activateTarget" , { targetId : this . targetId } ) ;
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
} ) ;
2022-08-17 21:40:10 -07:00
page . _client ( ) . on ( "Page.windowOpen" , async ( resp ) => {
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
if ( resp . url ) {
await page . goto ( resp . url ) ;
}
} ) ;
2022-05-05 14:27:17 -05:00
this . shutdownWait = params . shutdownWait * 1000 ;
if ( this . shutdownWait ) {
this . shutdownTimer = setTimeout ( ( ) => process . exit ( 0 ) , this . shutdownWait ) ;
console . log ( ` Shutting down in ${ this . shutdownWait } ms if no ping received ` ) ;
} else {
this . shutdownTimer = 0 ;
}
const httpServer = http . createServer ( ( req , res ) => this . handleRequest ( req , res ) ) ;
const port = 9223 ;
httpServer . listen ( port ) ;
console . log ( ` Browser Profile UI Server started. Load http://localhost: ${ port } / to interact with a Chromium-based browser, click 'Create Profile' when done. ` ) ;
}
2022-05-18 23:23:32 -07:00
handlePageLoad ( ) {
this . addOrigin ( ) ;
this . saveCookiesFor ( this . page . url ( ) ) ;
}
async saveAllCookies ( ) {
console . log ( "Saving all cookies" ) ;
for ( const origin of this . originSet . values ( ) ) {
await this . saveCookiesFor ( origin + "/" ) ;
}
}
async saveCookiesFor ( url ) {
try {
if ( this . params . cookieDays <= 0 ) {
return ;
}
const cookies = await this . page . cookies ( url ) ;
for ( const cookie of cookies ) {
cookie . url = url ;
2022-06-01 09:10:27 -07:00
cookie . expires = ( new Date ( ) . getTime ( ) / 1000 ) + this . params . cookieDays * 86400 ;
2022-05-18 23:23:32 -07:00
delete cookie . size ;
delete cookie . session ;
if ( cookie . sameSite && cookie . sameSite !== "Lax" && cookie . sameSite !== "Strict" ) {
delete cookie . sameSite ;
}
}
await this . page . setCookie ( ... cookies ) ;
} catch ( e ) {
console . log ( "Save Cookie Error: " + e ) ;
}
}
2022-05-05 14:27:17 -05:00
addOrigin ( ) {
const url = this . page . url ( ) ;
console . log ( "Adding origin for" , url ) ;
if ( url . startsWith ( "http:" ) || url . startsWith ( "https:" ) ) {
this . originSet . add ( new URL ( url ) . origin ) ;
}
}
async handleRequest ( req , res ) {
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
const parsedUrl = new URL ( req . url , ` http:// ${ req . headers . host } ` ) ;
const pathname = parsedUrl . pathname ;
2022-05-05 14:27:17 -05:00
let targetUrl ;
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
let origins ;
2022-05-05 14:27:17 -05:00
switch ( pathname ) {
case "/" :
targetUrl = ` http:// $ HOST:9222/devtools/inspector.html?ws= $ HOST:9222/devtools/page/ ${ this . targetId } &panel=resources ` ;
2021-07-20 15:45:51 -07:00
res . writeHead ( 200 , { "Content-Type" : "text/html" } ) ;
2022-03-18 10:32:59 -07:00
res . end ( profileHTML . replace ( "$DEVTOOLS_SRC" , targetUrl . replaceAll ( "$HOST" , parsedUrl . hostname ) ) ) ;
2022-05-05 14:27:17 -05:00
return ;
2021-07-20 15:45:51 -07:00
2022-05-05 14:27:17 -05:00
case "/ping" :
if ( this . shutdownWait ) {
clearInterval ( this . shutdownTimer ) ;
this . shutdownTimer = setTimeout ( ( ) => process . exit ( 0 ) , this . shutdownWait ) ;
console . log ( ` Ping received, delaying shutdown for ${ this . shutdownWait } ms ` ) ;
}
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
origins = Array . from ( this . originSet . values ( ) ) ;
2022-05-05 14:27:17 -05:00
res . writeHead ( 200 , { "Content-Type" : "application/json" } ) ;
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
res . end ( JSON . stringify ( { pong : true , origins } ) ) ;
2022-05-05 14:27:17 -05:00
return ;
case "/target" :
res . writeHead ( 200 , { "Content-Type" : "application/json" } ) ;
res . end ( JSON . stringify ( { targetId : this . targetId } ) ) ;
return ;
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
case "/navigate" :
2022-05-05 14:27:17 -05:00
if ( req . method !== "POST" ) {
break ;
}
try {
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exists (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
const postData = await this . readBodyJson ( req ) ;
const url = new URL ( postData . url ) . href ;
2022-05-05 14:27:17 -05:00
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
res . writeHead ( 200 , { "Content-Type" : "application/json" } ) ;
res . end ( JSON . stringify ( { success : true } ) ) ;
2022-05-05 14:27:17 -05:00
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
this . page . goto ( url ) ;
2022-05-05 14:27:17 -05:00
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
} catch ( e ) {
res . writeHead ( 400 , { "Content-Type" : "application/json" } ) ;
res . end ( JSON . stringify ( { "error" : e . toString ( ) } ) ) ;
console . log ( e ) ;
}
return ;
2022-05-05 14:27:17 -05:00
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
case "/createProfileJS" :
if ( req . method !== "POST" ) {
break ;
}
2022-05-05 14:27:17 -05:00
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
try {
const postData = await this . readBodyJson ( req ) ;
const targetFilename = postData . filename || "" ;
2022-05-18 23:23:32 -07:00
await this . saveAllCookies ( ) ;
2022-05-05 14:27:17 -05:00
const resource = await createProfile ( this . params , this . browser , this . page , targetFilename ) ;
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
origins = Array . from ( this . originSet . values ( ) ) ;
2022-05-05 14:27:17 -05:00
res . writeHead ( 200 , { "Content-Type" : "application/json" } ) ;
res . end ( JSON . stringify ( { resource , origins } ) ) ;
} catch ( e ) {
res . writeHead ( 500 , { "Content-Type" : "application/json" } ) ;
res . end ( JSON . stringify ( { "error" : e . toString ( ) } ) ) ;
console . log ( e ) ;
}
2021-07-20 15:45:51 -07:00
2022-05-05 14:27:17 -05:00
setTimeout ( ( ) => process . exit ( 0 ) , 200 ) ;
return ;
case "/createProfile" :
if ( req . method !== "POST" ) {
break ;
}
2021-07-20 15:45:51 -07:00
try {
2022-05-18 23:23:32 -07:00
await this . saveAllCookies ( ) ;
2022-05-05 14:27:17 -05:00
await createProfile ( this . params , this . browser , this . page ) ;
2021-07-20 15:45:51 -07:00
res . writeHead ( 200 , { "Content-Type" : "text/html" } ) ;
res . end ( "<html><body>Profile Created! You may now close this window.</body></html>" ) ;
} catch ( e ) {
res . writeHead ( 500 , { "Content-Type" : "text/html" } ) ;
res . end ( "<html><body>Profile creation failed! See the browsertrix-crawler console for more info" ) ;
console . log ( e ) ;
}
setTimeout ( ( ) => process . exit ( 0 ) , 200 ) ;
2022-05-05 14:27:17 -05:00
return ;
2021-07-20 15:45:51 -07:00
}
2022-05-05 14:27:17 -05:00
res . writeHead ( 404 , { "Content-Type" : "text/html" } ) ;
res . end ( "Not Found" ) ;
}
Health Check + Size Limits + Profile fixes (#138)
- Add optional health check via `--healthCheckPort`. If set, runs a server on designated port that returns 200 if healthcheck succeeds (num of consecutive failed page loads < 2*num workers), or 503 if fails. Useful for k8s health check
- Add crawl size limit (in bytes), via `--sizeLimit`. Crawl exits (and state optionally saved) when size limit is exceeded.
- Add crawl total time limit (in seconds), via `--timeLimit`. Crawl exits (and state optionally saved) when total running time is exceeded.
- Add option to overwrite existing collection. If `--overwrite` is included, any existing data for specified collection is deleted.
- S3 Storage refactor, simplify, don't add additional paths by default.
- Add interpolateFilename as generic utility, supported in filename and STORE_PATH env value.
- wacz save: reenable wacz validation after save.
- Profiles: support /navigate endpoint, return origins from /ping, prevent opening new tabs.
- bump to 0.6.0-beta.1
2022-05-18 22:51:55 -07:00
async readBodyJson ( req ) {
const buffers = [ ] ;
for await ( const chunk of req ) {
buffers . push ( chunk ) ;
}
const data = Buffer . concat ( buffers ) . toString ( ) ;
if ( data . length ) {
try {
return JSON . parse ( data ) || { } ;
} catch ( e ) {
return { } ;
}
}
}
2021-07-20 15:45:51 -07:00
}
2022-05-05 14:27:17 -05:00
2021-04-10 13:08:22 -07:00
// Script entry point: kick off execution by calling main().
main ( ) ;