Add --netIdleWait, bump dependencies (0.7.0-beta.2) (#145)

- add --netIdleWait option (in seconds, default 10): wait for network idle after page load and after behaviors are done - necessary for some sites that start fetching immediately after page load
- add openssl.conf (enabling the UnsafeLegacyRenegotiation option) so that pywb avoids the 'unsafe legacy renegotiation disabled' error from OpenSSL
- update to browsertrix-behaviors 0.3.2
- set the current URL used for screencasting a page before the page load starts
- bump version to 0.7.0-beta.2
This commit is contained in:
Ilya Kreymer 2022-07-08 17:17:46 -07:00 committed by GitHub
parent bd10f1ad8c
commit e3b8b5ba21
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 62 additions and 14 deletions

View file

@ -13,7 +13,8 @@ ENV PROXY_HOST=localhost \
DISPLAY=:99 \
GEOMETRY=1360x1020x16 \
BROWSER_VERSION=${BROWSER_VERSION} \
BROWSER_BIN=google-chrome
BROWSER_BIN=google-chrome \
OPENSSL_CONF=/app/openssl.conf
WORKDIR /app
@ -28,10 +29,11 @@ ARG REBUILD
RUN yarn install
ADD uwsgi.ini /app/
ADD *.js /app/
ADD util/*.js /app/util/
COPY config.yaml /app/
ADD config/ /app/
ADD html/ /app/html/
RUN ln -s /app/main.js /usr/bin/crawl; ln -s /app/create-login-profile.js /usr/bin/create-login-profile

10
config/openssl.conf Normal file
View file

@ -0,0 +1,10 @@
# OpenSSL configuration that turns on the UnsafeLegacyRenegotiation option.
# Per the accompanying change description, this is meant to let pywb avoid
# OpenSSL's 'unsafe legacy renegotiation disabled' error.
# Top-level pointer to the section used for library initialization.
openssl_conf = openssl_init
[openssl_init]
# Pointer to the section holding SSL/TLS settings.
ssl_conf = ssl_sect
[ssl_sect]
# Pointer to the section with the system-wide default SSL/TLS settings.
system_default = system_default_sect
[system_default_sect]
# Re-enable legacy (pre-RFC 5746) renegotiation, which is off by default.
Options = UnsafeLegacyRenegotiation

View file

@ -103,6 +103,7 @@ class Crawler {
this.done = false;
this.sizeExceeded = false;
this.finalExit = false;
this.behaviorLastLine = null;
}
statusLog(...args) {
@ -264,7 +265,7 @@ class Crawler {
handleSIGTERM: false,
handleSIGHUP: false,
ignoreHTTPSErrors: true,
args: chromeArgs(true, this.userAgent),
args: chromeArgs(!process.env.NO_PROXY, this.userAgent),
userDataDir: this.profileDir,
defaultViewport: null,
};
@ -300,9 +301,15 @@ class Crawler {
}
_behaviorLog({data, type}) {
let behaviorLine;
switch (type) {
case "info":
console.log(JSON.stringify(data));
behaviorLine = JSON.stringify(data);
if (behaviorLine != this._behaviorLastLine) {
console.log(behaviorLine);
this._behaviorLastLine = behaviorLine;
}
break;
case "debug":
@ -316,7 +323,7 @@ class Crawler {
async crawlPage({page, data}) {
try {
if (this.screencaster) {
await this.screencaster.screencastTarget(page.target());
await this.screencaster.screencastTarget(page.target(), data.url);
}
if (this.emulateDevice) {
@ -353,6 +360,9 @@ class Crawler {
console.log("Skipping behaviors for non-HTML page");
} else {
await Promise.allSettled(page.frames().map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();")));
// also wait for general net idle
await this.netIdle(page);
}
}
@ -701,6 +711,8 @@ class Crawler {
await this.checkCF(page);
await this.netIdle(page);
// skip extraction if at max depth
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
return;
@ -712,6 +724,22 @@ class Crawler {
}
}
async netIdle(page) {
if (!this.params.netIdleWait) {
return;
}
// in case page starts loading via fetch/xhr immediately after page load,
// we want to ensure we don't exit too early
await this.sleep(0.5);
try {
await page.waitForNetworkIdle({timeout: this.params.netIdleWait * 1000});
} catch (e) {
console.log("note: waitForNetworkIdle timed out, ignoring");
// ignore, continue
}
}
async extractLinks(page, {selector = "a[href]", extract = "href", isAttribute = false} = {}) {
const results = [];

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.7.0-beta.1",
"version": "0.7.0-beta.2",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
@ -10,7 +10,7 @@
},
"dependencies": {
"abort-controller": "^3.0.0",
"browsertrix-behaviors": "^0.3.0",
"browsertrix-behaviors": "^0.3.2",
"get-folder-size": "2",
"ioredis": "^4.27.1",
"js-yaml": "^4.1.0",

View file

@ -287,6 +287,12 @@ class ArgParser {
type: "boolean",
default: false
},
"netIdleWait": {
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds)",
type: "number",
default: 10
}
};
}

View file

@ -189,9 +189,11 @@ class ScreenCaster
context.__destroy_added = true;
}
async screencastTarget(target) {
async screencastTarget(target, currUrl) {
const id = target._targetId;
this.urls.set(id, currUrl);
if (this.targets.has(id)) {
return;
}
@ -201,7 +203,7 @@ class ScreenCaster
const cdp = await target.createCDPSession();
this.targets.set(id, cdp);
this.urls.set(id, target.url());
//this.urls.set(id, target.url());
const msg = "screencast";

View file

@ -1159,10 +1159,10 @@ browserslist@^4.14.5:
escalade "^3.1.1"
node-releases "^1.1.71"
browsertrix-behaviors@^0.3.0:
version "0.3.0"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.3.0.tgz#6f424006d9393b760199c144777c529520b24db3"
integrity sha512-FI9JuGk15LJ/ofTth3uPWUQHEaeVDCK3l7UfsmGuchH9E1YE8KRocmaCCpMNGgd5ABo3Ymg5BWAFe30eX53u7A==
browsertrix-behaviors@^0.3.2:
version "0.3.2"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.3.2.tgz#730c470916f1d383df7f2455a58f772f66cb31f4"
integrity sha512-bWvw8yx+g9PPxvAcHUwXj93AHoKI62qIoLXhj/gjUNBw9a8ueXICG5HRGx0UasRcq9WCDvi094016nLdJiKAbQ==
bser@2.1.1:
version "2.1.1"