const puppeteer = require("puppeteer-core");
const { Cluster } = require("puppeteer-cluster");
const child_process = require("child_process");
const fetch = require("node-fetch");
const AbortController = require("abort-controller");
const path = require("path");
const fs = require("fs");
const Sitemapper = require("sitemapper");
const { v4: uuidv4 } = require("uuid");
const warcio = require("warcio");

const TextExtract = require("./textextract");
const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");

const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];

const BEHAVIOR_LOG_FUNC = "__bx_log";

const CHROME_PATH = "google-chrome";

// agent that ignores HTTPS errors, for the HEAD check
const HTTPS_AGENT = require("https").Agent({
  rejectUnauthorized: false,
});

const HTTP_AGENT = require("http").Agent();

// ============================================================================
class Crawler {
  constructor() {
    this.headers = {};

    this.seenList = new Set();

    this.emulateDevice = null;

    // counter of crawled links
    this.numLinks = 0;

    // was the page limit hit?
    this.limitHit = false;

    this.userAgent = "";
    this.behaviorsLogDebug = false;

    const params = require("yargs")
      .usage("browsertrix-crawler [options]")
      .option(this.cliOpts)
      .check((argv) => this.validateArgs(argv)).argv;

    console.log("Exclusion Regexes: ", params.exclude);
    console.log("Scope Regexes: ", params.scope);

    this.params = params;
    this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;

    // root collections dir
    this.collDir = path.join(this.params.cwd, "collections", this.params.collection);

    // pages directory
    this.pagesDir = path.join(this.collDir, "pages");

    // pages file
    this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
  }
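
  // Illustrative example (hypothetical values): with PROXY_HOST=localhost,
  // PROXY_PORT=8080 and --collection my-crawl, capturePrefix becomes
  //   http://localhost:8080/my-crawl/record/id_/
  // and pages are recorded to <cwd>/collections/my-crawl/pages/pages.jsonl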

  configureUA() {
    // if set, override the userAgent entirely
    if (this.params.userAgent) {
      if (this.emulateDevice) {
        this.emulateDevice.userAgent = this.params.userAgent;
      }
      this.userAgent = this.params.userAgent;
      return;
    }

    // if an emulated device is set, it overrides the default Chrome UA
    if (this.emulateDevice) {
      this.userAgent = this.emulateDevice.userAgent;
    } else {
      let version = process.env.BROWSER_VERSION;

      try {
        version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
      } catch(e) {
        console.log(e);
      }

      this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
    }

    // append the suffix, if any, to the chosen userAgent
    if (this.params.userAgentSuffix) {
      this.userAgent += " " + this.params.userAgentSuffix;

      if (this.emulateDevice) {
        this.emulateDevice.userAgent += " " + this.params.userAgentSuffix;
      }
    }
  }
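
  // For example (hypothetical Chrome version), with no overrides the UA might be:
  //   Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.0 Safari/537.36
  // and with --userAgentSuffix "+MyCrawler info@example.com" that suffix is appended.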

  bootstrap() {
    // start the support services: redis, pywb collection init, the pywb
    // recording proxy (via uwsgi), and Xvfb when not running headless
    let opts = {};

    if (this.params.logging.includes("pywb")) {
      opts = {stdio: "inherit", cwd: this.params.cwd};
    } else {
      opts = {stdio: "ignore", cwd: this.params.cwd};
    }

    this.configureUA();

    this.headers = {"User-Agent": this.userAgent};

    child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});

    child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);

    opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};

    child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts);

    if (!this.params.headless) {
      child_process.spawn("Xvfb", [
        process.env.DISPLAY,
        "-listen",
        "tcp",
        "-screen",
        "0",
        process.env.GEOMETRY,
        "-ac",
        "+extension",
        "RANDR"
      ]);
    }
  }

  get cliOpts() {
    return {
      "url": {
        alias: "u",
        describe: "The URL to start crawling from",
        type: "string",
        demandOption: true,
      },

      "workers": {
        alias: "w",
        describe: "The number of workers to run in parallel",
        default: 1,
        type: "number",
      },

      "newContext": {
        describe: "The context for each new capture, can be a new: page, session or browser.",
        default: "page",
        type: "string"
      },

      "waitUntil": {
        describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
        default: "load,networkidle0",
      },

      "limit": {
        describe: "Limit crawl to this number of pages",
        default: 0,
        type: "number",
      },

      "timeout": {
        describe: "Timeout for each page to load (in seconds)",
        default: 90,
        type: "number",
      },

      "scope": {
        describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
      },

      "exclude": {
        describe: "Regex of page URLs that should be excluded from the crawl."
      },

      "collection": {
        alias: "c",
        describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
        type: "string",
        default: `capture-${new Date().toISOString().slice(0, 18)}`.replace(/:/g, "-")
      },

      "headless": {
        describe: "Run in headless mode, otherwise start xvfb",
        type: "boolean",
        default: false,
      },

      "driver": {
        describe: "JS driver for the crawler",
        type: "string",
        default: path.join(__dirname, "defaultDriver.js"),
      },

      "generateCDX": {
        alias: ["generatecdx", "generateCdx"],
        describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
        type: "boolean",
        default: false,
      },

      "combineWARC": {
        alias: ["combinewarc", "combineWarc"],
        describe: "If set, combine the WARCs",
        type: "boolean",
        default: false,
      },

      "rolloverSize": {
        describe: "If set, declare the rollover size",
        default: 1000000000,
        type: "number",
      },

      "generateWACZ": {
        alias: ["generatewacz", "generateWacz"],
        describe: "If set, generate WACZ",
        type: "boolean",
        default: false,
      },

      "logging": {
        describe: "Logging options for crawler, can include: stats, pywb, behaviors",
        type: "string",
        default: "stats",
      },

      "text": {
        describe: "If set, extract text to the pages.jsonl file",
        type: "boolean",
        default: false,
      },

      "cwd": {
        describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
        type: "string",
        default: process.cwd(),
      },

      "mobileDevice": {
        describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
        type: "string",
      },

      "userAgent": {
        describe: "Override user-agent with specified string",
        type: "string",
      },

      "userAgentSuffix": {
        describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
        type: "string",
      },

      "useSitemap": {
        describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
      },

      "statsFilename": {
        describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
      },

      "behaviors": {
        describe: "Which background behaviors to enable on each page",
        default: "autoplay,autofetch,siteSpecific",
        type: "string",
      },
    };
  }
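
  // Example invocation (hypothetical values):
  //   browsertrix-crawler --url https://example.com/ --workers 4 --text \
  //     --generateWACZ --collection example-crawl --behaviors autoplay,autofetch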

  validateUserUrl(url) {
    url = new URL(url);

    if (url.protocol !== "http:" && url.protocol !== "https:") {
      throw new Error("URL must start with http:// or https://");
    }

    return url.href;
  }

  validateArgs(argv) {
    if (argv.url) {
      // Scope for crawl, default to the domain of the URL
      // ensure a valid url is used (adds trailing slash if missing)
      //argv.seeds = [Crawler.validateUserUrl(argv.url)];
      argv.url = this.validateUserUrl(argv.url);
    }

    if (!argv.scope) {
      //argv.scope = url.href.slice(0, url.href.lastIndexOf("/") + 1);
      argv.scope = [new RegExp("^" + this.rxEscape(argv.url.slice(0, argv.url.lastIndexOf("/") + 1)))];
    }

    // Check that the collection name is valid.
    if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
      throw new Error(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
    }

    argv.timeout *= 1000;

    // waitUntil condition must be one of: load, domcontentloaded, networkidle0, networkidle2
    // can be multiple, separated by comma
    // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
    argv.waitUntil = argv.waitUntil.split(",");

    for (const opt of argv.waitUntil) {
      if (!WAIT_UNTIL_OPTS.includes(opt)) {
        throw new Error("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
      }
    }

    // log options
    argv.logging = argv.logging.split(",");

    // background behaviors to apply
    const behaviorOpts = {};
    argv.behaviors.split(",").forEach((x) => behaviorOpts[x] = true);
    if (argv.logging.includes("behaviors")) {
      behaviorOpts.log = BEHAVIOR_LOG_FUNC;
    } else if (argv.logging.includes("behaviors-debug")) {
      behaviorOpts.log = BEHAVIOR_LOG_FUNC;
      this.behaviorsLogDebug = true;
    }
    this.behaviorOpts = JSON.stringify(behaviorOpts);

    if (!argv.newContext) {
      argv.newContext = "page";
    }

    switch (argv.newContext) {
    case "page":
      argv.newContext = Cluster.CONCURRENCY_PAGE;
      break;

    case "session":
      argv.newContext = Cluster.CONCURRENCY_CONTEXT;
      break;

    case "browser":
      argv.newContext = Cluster.CONCURRENCY_BROWSER;
      break;

    default:
      throw new Error("Invalid newContext, must be one of: page, session, browser");
    }

    if (argv.mobileDevice) {
      this.emulateDevice = puppeteer.devices[argv.mobileDevice];
      if (!this.emulateDevice) {
        throw new Error("Unknown device: " + argv.mobileDevice);
      }
    }

    if (argv.useSitemap === true) {
      const url = new URL(argv.url);
      url.pathname = "/sitemap.xml";
      argv.useSitemap = url.href;
    }

    // Support one or multiple excludes
    if (argv.exclude) {
      if (typeof(argv.exclude) === "string") {
        argv.exclude = [new RegExp(argv.exclude)];
      } else {
        argv.exclude = argv.exclude.map(e => new RegExp(e));
      }
    } else {
      argv.exclude = [];
    }

    // Support one or multiple scopes
    if (argv.scope) {
      if (typeof(argv.scope) === "string") {
        argv.scope = [new RegExp(argv.scope)];
      } else {
        argv.scope = argv.scope.map(e => new RegExp(e));
      }
    } else {
      argv.scope = [];
    }

    // Resolve statsFilename relative to the crawl working directory
    if (argv.statsFilename) {
      argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
    }

    return true;
  }
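
  // For example (illustrative): with --url https://example.com/docs/index.html and
  // no --scope, the default scope becomes the regex ^https:\/\/example\.com\/docs\/
  // so only URLs under /docs/ are queued.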

  get chromeArgs() {
    // Chrome flags, including the pywb recording proxy server
    return [
      "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
      `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
      "--no-sandbox",
      "--disable-background-media-suspend",
      "--autoplay-policy=no-user-gesture-required",
      "--disable-features=IsolateOrigins,site-per-process",
    ];
  }

  get puppeteerArgs() {
    // Puppeteer options
    return {
      headless: this.params.headless,
      executablePath: CHROME_PATH,
      ignoreHTTPSErrors: true,
      args: this.chromeArgs
    };
  }

  async run() {
    this.bootstrap();

    try {
      await this.crawl();
      process.exit(0);
    } catch(e) {
      console.error("Crawl failed");
      console.error(e);
      process.exit(1);
    }
  }

  async crawlPage({page, data}) {
    try {
      if (this.emulateDevice) {
        await page.emulate(this.emulateDevice);
      }

      if (this.behaviorOpts) {
        await page.exposeFunction(BEHAVIOR_LOG_FUNC, ({data, type}) => {
          switch (type) {
          case "info":
            console.log(JSON.stringify(data));
            break;

          case "debug":
          default:
            if (this.behaviorsLogDebug) {
              console.log("behavior debug: " + JSON.stringify(data));
            }
          }
        });

        await page.evaluateOnNewDocument(behaviors + `
self.__bx_behaviors.init(${this.behaviorOpts});
`);
      }

      // run custom driver here
      await this.driver({page, data, crawler: this});

      const title = await page.title();
      let text = "";

      if (this.params.text) {
        const client = await page.target().createCDPSession();
        const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
        text = await new TextExtract(result).parseTextFromDom();
      }

      this.writePage(data.url, title, this.params.text, text);

      if (this.behaviorOpts) {
        await Promise.allSettled(page.frames().map(frame => frame.evaluate("self.__bx_behaviors.run();")));
      }

      this.writeStats();
    } catch (e) {
      console.warn(e);
    }
  }
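
  // Behavior log calls arrive through the exposed __bx_log function as
  // {data, type} pairs: "info" messages are printed as JSON, while "debug"
  // messages are printed only when --logging includes behaviors-debug.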

  async createWARCInfo(filename) {
    const warcVersion = "WARC/1.1";
    const type = "warcinfo";
    const packageFileJSON = JSON.parse(fs.readFileSync("../app/package.json"));
    const pywb_version = fs.readFileSync("/usr/local/lib/python3.8/site-packages/pywb/version.py", "utf8").split("\n")[0].split("=")[1].trim().replace(/['"]+/g, "");
    const warcioPackageJson = JSON.parse(fs.readFileSync("/app/node_modules/warcio/package.json"));

    const info = {
      "software": `Browsertrix-Crawler ${packageFileJSON["version"]} (with warcio.js ${warcioPackageJson["version"]} pywb ${pywb_version})`,
      "format": "WARC File Format 1.1"
    };

    const record = await warcio.WARCRecord.createWARCInfo({filename, type, warcVersion}, info);
    const buffer = await warcio.WARCSerializer.serialize(record, {gzip: true});
    return buffer;
  }

  getFileSize(filename) {
    const stats = fs.statSync(filename);
    return stats.size;
  }
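
  // The resulting warcinfo "software" field looks like (hypothetical versions):
  //   Browsertrix-Crawler 0.3.2 (with warcio.js 1.4.0 pywb 2.5.0)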

  async crawl() {
    try {
      this.driver = require(this.params.driver);
    } catch(e) {
      console.log(e);
      return;
    }

    // Puppeteer Cluster init and options
    this.cluster = await Cluster.launch({
      concurrency: this.params.newContext,
      maxConcurrency: this.params.workers,
      skipDuplicateUrls: true,
      timeout: this.params.timeout * 2,
      puppeteerOptions: this.puppeteerArgs,
      puppeteer,
      monitor: this.params.logging.includes("stats")
    });

    this.cluster.task((opts) => this.crawlPage(opts));

    this.initPages();

    this.queueUrl(this.params.url);

    if (this.params.useSitemap) {
      await this.parseSitemap(this.params.useSitemap);
    }

    await this.cluster.idle();
    await this.cluster.close();

    this.writeStats();

    // extra wait for all resources to land into WARCs
    console.log("Waiting 5s to ensure WARCs are finished");
    await this.sleep(5000);

    if (this.params.combineWARC) {
      await this.combineWARC();
    }

    if (this.params.generateCDX) {
      console.log("Generate CDX");
      child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
    }

    // aliases (generateWacz, generatewacz) are normalized by yargs
    if (this.params.generateWACZ) {
      console.log("Generating WACZ");

      const archiveDir = path.join(this.collDir, "archive");

      // Get a list of the warcs inside
      const warcFileList = fs.readdirSync(archiveDir);

      // Build the argument list to pass to the wacz create command
      const waczFilename = this.params.collection.concat(".wacz");
      const waczPath = path.join(this.collDir, waczFilename);
      const argument_list = ["create", "-o", waczPath, "--pages", this.pagesFile, "-f"];
      warcFileList.forEach((val) => argument_list.push(path.join(archiveDir, val)));

      // Run the wacz create command
      child_process.spawnSync("wacz", argument_list);
      console.log(`WACZ successfully generated and saved to: ${waczFilename}`);
    }
  }

  writeStats() {
    if (this.params.statsFilename) {
      const total = this.cluster.allTargetCount;
      const workersRunning = this.cluster.workersBusy.length;
      const numCrawled = total - this.cluster.jobQueue.size() - workersRunning;
      const limit = {max: this.params.limit || 0, hit: this.limitHit};
      const stats = {numCrawled, workersRunning, total, limit};

      try {
        fs.writeFileSync(this.params.statsFilename, JSON.stringify(stats, null, 2));
      } catch (err) {
        console.warn("Stats output failed", err);
      }
    }
  }
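
  // Example stats output (illustrative values):
  //   {"numCrawled": 12, "workersRunning": 2, "total": 20, "limit": {"max": 0, "hit": false}}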

  async extractLinks(page, selector = "a[href]") {
    let results = null;

    try {
      results = await page.evaluate((selector) => {
        /* eslint-disable-next-line no-undef */
        return [...document.querySelectorAll(selector)].map(elem => elem.href);
      }, selector);
    } catch (e) {
      console.warn("Link Extraction failed", e);
      return;
    }

    this.queueUrls(results);
  }

  queueUrls(urls) {
    try {
      for (const url of urls) {
        const captureUrl = this.shouldCrawl(url);
        if (captureUrl) {
          if (!this.queueUrl(captureUrl)) {
            break;
          }
        }
      }
    } catch (e) {
      console.log("Queuing Error: ", e);
    }
  }

  queueUrl(url) {
    this.seenList.add(url);
    if (this.numLinks >= this.params.limit && this.params.limit > 0) {
      this.limitHit = true;
      return false;
    }
    this.numLinks++;
    this.cluster.queue({url});
    return true;
  }

  initPages() {
    try {
      // create the pages dir if it doesn't exist and write the pages.jsonl header
      if (!fs.existsSync(this.pagesDir)) {
        fs.mkdirSync(this.pagesDir);
        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};

        if (this.params.text) {
          console.log("creating pages with full text");
          header["hasText"] = true;
        } else {
          console.log("creating pages without full text");
          header["hasText"] = false;
        }

        const header_formatted = JSON.stringify(header).concat("\n");
        fs.writeFileSync(this.pagesFile, header_formatted);
      }
    } catch(err) {
      console.log("pages/pages.jsonl creation failed", err);
    }
  }

  writePage(url, title, text, text_content) {
    const id = uuidv4();
    const row = {"id": id, "url": url, "title": title};

    if (text === true) {
      row["text"] = text_content;
    }

    const processedRow = JSON.stringify(row).concat("\n");
    try {
      fs.appendFileSync(this.pagesFile, processedRow);
    } catch (err) {
      console.warn("pages/pages.jsonl append failed", err);
    }
  }
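
  // Sample pages.jsonl contents (illustrative):
  //   {"format":"json-pages-1.0","id":"pages","title":"All Pages","hasText":true}
  //   {"id":"<uuid4>","url":"https://example.com/","title":"Example Domain","text":"..."}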

  shouldCrawl(url) {
    try {
      url = new URL(url);
    } catch(e) {
      return false;
    }

    // remove hashtag
    url.hash = "";

    // only queue http/https URLs
    if (url.protocol !== "http:" && url.protocol !== "https:") {
      return false;
    }

    url = url.href;

    // skip already crawled URLs
    if (this.seenList.has(url)) {
      return false;
    }

    let inScope = false;

    // check scopes
    for (const s of this.params.scope) {
      if (s.exec(url)) {
        inScope = true;
        break;
      }
    }

    if (!inScope) {
      //console.log(`Not in scope ${url} ${scope}`);
      return false;
    }

    // check exclusions
    for (const e of this.params.exclude) {
      if (e.exec(url)) {
        //console.log(`Skipping ${url} excluded by ${e}`);
        return false;
      }
    }

    return url;
  }
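
  // Illustrative decisions, assuming scope regex ^https:\/\/example\.com\/docs\/ :
  //   shouldCrawl("https://example.com/docs/a.html") -> "https://example.com/docs/a.html"
  //   shouldCrawl("https://example.com/other/")      -> false (not in scope)
  //   shouldCrawl("mailto:someone@example.com")      -> false (not http/https)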

  resolveAgent(urlParsed) {
    return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
  }

  async isHTML(url) {
    try {
      const resp = await fetch(url, {
        method: "HEAD",
        headers: this.headers,
        agent: this.resolveAgent
      });

      if (resp.status >= 400) {
        console.log(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
        return true;
      }

      const contentType = resp.headers.get("Content-Type");

      // just load if no content-type
      if (!contentType) {
        return true;
      }

      const mime = contentType.split(";")[0];

      if (HTML_TYPES.includes(mime)) {
        return true;
      }

      return false;
    } catch(e) {
      console.log("HTML Check error", e);
      // can't confirm that it's not html, so try loading in the browser
      return true;
    }
  }

  async directFetchCapture(url) {
    //console.log(`Direct capture: ${this.capturePrefix}${url}`);
    const abort = new AbortController();
    const signal = abort.signal;
    await fetch(this.capturePrefix + url, {signal, headers: this.headers});
    abort.abort();
  }

  sleep(time) {
    return new Promise(resolve => setTimeout(resolve, time));
  }

  rxEscape(string) {
    return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
  }
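
  // e.g. rxEscape("example.com/path") yields the string example\.com\/path,
  // so those characters match literally when compiled into a RegExp.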

  async parseSitemap(url) {
    const sitemapper = new Sitemapper({
      url,
      timeout: 15000,
      requestHeaders: this.headers
    });

    try {
      const {sites} = await sitemapper.fetch();
      this.queueUrls(sites);
    } catch(e) {
      console.log(e);
    }
  }

  async combineWARC() {
    console.log("Combining the WARCs");

    // Get the list of created WARCs
    const warcLists = fs.readdirSync(path.join(this.collDir, "archive"));

    const fileSizeObjects = []; // Used to sort the created WARCs by fileSize

    // Build an array of the created WARCs, sorted by file size with the largest file first
    for (let i = 0; i < warcLists.length; i++) {
      const fileName = path.join(this.collDir, "archive", warcLists[i]);
      const fileSize = this.getFileSize(fileName);
      fileSizeObjects.push({"fileSize": fileSize, "fileName": fileName});
    }
    fileSizeObjects.sort((a, b) => b.fileSize - a.fileSize);

    const generatedCombinedWarcs = [];

    // Used to name combined WARCs, default to -1 for first increment
    let combinedWarcNumber = -1;

    // write combined WARC to collection root
    let combinedWarcFullPath = "";

    // Iterate through the sorted file size array
    for (let j = 0; j < fileSizeObjects.length; j++) {
      // do we need to rollover to a new combined WARC?
      let doRollover = false;

      // set to true for the first WARC
      if (combinedWarcNumber < 0) {
        doRollover = true;
      } else {
        // Check the size of the existing combined WARC: roll over only if adding
        // the current WARC would reach or exceed the rollover size
        const currentCombinedWarcSize = this.getFileSize(combinedWarcFullPath);
        const proposedWarcSize = fileSizeObjects[j].fileSize + currentCombinedWarcSize;
        doRollover = (proposedWarcSize >= this.params.rolloverSize);
      }

      if (doRollover) {
        // On rollover:
        // 1. increment the combinedWarcNumber
        // 2. create the name of the new combined WARC file
        // 3. write the warcinfo header out to the new file
        // 4. append the current WARC data to the combined file (below)
        combinedWarcNumber = combinedWarcNumber + 1;
        const combinedWarcName = `${this.params.collection}_${combinedWarcNumber}.warc`;

        // write combined WARCs to the root collection dir as they're output of a collection (like wacz)
        combinedWarcFullPath = path.join(this.collDir, combinedWarcName);
        generatedCombinedWarcs.push(combinedWarcName);

        const warcBuffer = await this.createWARCInfo(combinedWarcName);
        fs.writeFileSync(combinedWarcFullPath, warcBuffer);
      }

      fs.appendFileSync(combinedWarcFullPath, fs.readFileSync(fileSizeObjects[j].fileName));
    }

    console.log(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
  }
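
  // e.g. with --collection my-crawl and the default ~1 GB rolloverSize, the
  // combined output might be: my-crawl_0.warc, my-crawl_1.warc, ...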
}

module.exports.Crawler = Crawler;