mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Add --netIdleWait, bump dependencies (0.7.0-beta.2) (#145)
- add --netIdleWait option, defaulting to 10 seconds; necessary for some sites that start fetching immediately after page load
- add openssl.conf to allow pywb to avoid the 'unsafe legacy renegotiation disabled' error from openssl
- update to browsertrix-behaviors 0.3.2
- update the current URL for screencasting of a page before page load starts
- bump to 0.7.0-beta.2
This commit is contained in:
parent
bd10f1ad8c
commit
e3b8b5ba21
9 changed files with 62 additions and 14 deletions
|
@ -13,7 +13,8 @@ ENV PROXY_HOST=localhost \
|
|||
DISPLAY=:99 \
|
||||
GEOMETRY=1360x1020x16 \
|
||||
BROWSER_VERSION=${BROWSER_VERSION} \
|
||||
BROWSER_BIN=google-chrome
|
||||
BROWSER_BIN=google-chrome \
|
||||
OPENSSL_CONF=/app/openssl.conf
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
@ -28,10 +29,11 @@ ARG REBUILD
|
|||
|
||||
RUN yarn install
|
||||
|
||||
ADD uwsgi.ini /app/
|
||||
ADD *.js /app/
|
||||
ADD util/*.js /app/util/
|
||||
COPY config.yaml /app/
|
||||
|
||||
ADD config/ /app/
|
||||
|
||||
ADD html/ /app/html/
|
||||
|
||||
RUN ln -s /app/main.js /usr/bin/crawl; ln -s /app/create-login-profile.js /usr/bin/create-login-profile
|
||||
|
|
10
config/openssl.conf
Normal file
10
config/openssl.conf
Normal file
|
@ -0,0 +1,10 @@
|
|||
openssl_conf = openssl_init
|
||||
|
||||
[openssl_init]
|
||||
ssl_conf = ssl_sect
|
||||
|
||||
[ssl_sect]
|
||||
system_default = system_default_sect
|
||||
|
||||
[system_default_sect]
|
||||
Options = UnsafeLegacyRenegotiation
|
34
crawler.js
34
crawler.js
|
@ -103,6 +103,7 @@ class Crawler {
|
|||
this.done = false;
|
||||
this.sizeExceeded = false;
|
||||
this.finalExit = false;
|
||||
this.behaviorLastLine = null;
|
||||
}
|
||||
|
||||
statusLog(...args) {
|
||||
|
@ -264,7 +265,7 @@ class Crawler {
|
|||
handleSIGTERM: false,
|
||||
handleSIGHUP: false,
|
||||
ignoreHTTPSErrors: true,
|
||||
args: chromeArgs(true, this.userAgent),
|
||||
args: chromeArgs(!process.env.NO_PROXY, this.userAgent),
|
||||
userDataDir: this.profileDir,
|
||||
defaultViewport: null,
|
||||
};
|
||||
|
@ -300,9 +301,15 @@ class Crawler {
|
|||
}
|
||||
|
||||
_behaviorLog({data, type}) {
|
||||
let behaviorLine;
|
||||
|
||||
switch (type) {
|
||||
case "info":
|
||||
console.log(JSON.stringify(data));
|
||||
behaviorLine = JSON.stringify(data);
|
||||
if (behaviorLine != this._behaviorLastLine) {
|
||||
console.log(behaviorLine);
|
||||
this._behaviorLastLine = behaviorLine;
|
||||
}
|
||||
break;
|
||||
|
||||
case "debug":
|
||||
|
@ -316,7 +323,7 @@ class Crawler {
|
|||
async crawlPage({page, data}) {
|
||||
try {
|
||||
if (this.screencaster) {
|
||||
await this.screencaster.screencastTarget(page.target());
|
||||
await this.screencaster.screencastTarget(page.target(), data.url);
|
||||
}
|
||||
|
||||
if (this.emulateDevice) {
|
||||
|
@ -353,6 +360,9 @@ class Crawler {
|
|||
console.log("Skipping behaviors for non-HTML page");
|
||||
} else {
|
||||
await Promise.allSettled(page.frames().map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();")));
|
||||
|
||||
// also wait for general net idle
|
||||
await this.netIdle(page);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -701,6 +711,8 @@ class Crawler {
|
|||
|
||||
await this.checkCF(page);
|
||||
|
||||
await this.netIdle(page);
|
||||
|
||||
// skip extraction if at max depth
|
||||
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
||||
return;
|
||||
|
@ -712,6 +724,22 @@ class Crawler {
|
|||
}
|
||||
}
|
||||
|
||||
async netIdle(page) {
|
||||
if (!this.params.netIdleWait) {
|
||||
return;
|
||||
}
|
||||
// in case page starts loading via fetch/xhr immediately after page load,
|
||||
// we want to ensure we don't exit too early
|
||||
await this.sleep(0.5);
|
||||
|
||||
try {
|
||||
await page.waitForNetworkIdle({timeout: this.params.netIdleWait * 1000});
|
||||
} catch (e) {
|
||||
console.log("note: waitForNetworkIdle timed out, ignoring");
|
||||
// ignore, continue
|
||||
}
|
||||
}
|
||||
|
||||
async extractLinks(page, {selector = "a[href]", extract = "href", isAttribute = false} = {}) {
|
||||
const results = [];
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "0.7.0-beta.1",
|
||||
"version": "0.7.0-beta.2",
|
||||
"main": "browsertrix-crawler",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||
|
@ -10,7 +10,7 @@
|
|||
},
|
||||
"dependencies": {
|
||||
"abort-controller": "^3.0.0",
|
||||
"browsertrix-behaviors": "^0.3.0",
|
||||
"browsertrix-behaviors": "^0.3.2",
|
||||
"get-folder-size": "2",
|
||||
"ioredis": "^4.27.1",
|
||||
"js-yaml": "^4.1.0",
|
||||
|
|
|
@ -287,6 +287,12 @@ class ArgParser {
|
|||
type: "boolean",
|
||||
default: false
|
||||
},
|
||||
|
||||
"netIdleWait": {
|
||||
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds)",
|
||||
type: "number",
|
||||
default: 10
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -189,9 +189,11 @@ class ScreenCaster
|
|||
context.__destroy_added = true;
|
||||
}
|
||||
|
||||
async screencastTarget(target) {
|
||||
async screencastTarget(target, currUrl) {
|
||||
const id = target._targetId;
|
||||
|
||||
this.urls.set(id, currUrl);
|
||||
|
||||
if (this.targets.has(id)) {
|
||||
return;
|
||||
}
|
||||
|
@ -201,7 +203,7 @@ class ScreenCaster
|
|||
const cdp = await target.createCDPSession();
|
||||
|
||||
this.targets.set(id, cdp);
|
||||
this.urls.set(id, target.url());
|
||||
//this.urls.set(id, target.url());
|
||||
|
||||
const msg = "screencast";
|
||||
|
||||
|
|
|
@ -1159,10 +1159,10 @@ browserslist@^4.14.5:
|
|||
escalade "^3.1.1"
|
||||
node-releases "^1.1.71"
|
||||
|
||||
browsertrix-behaviors@^0.3.0:
|
||||
version "0.3.0"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.3.0.tgz#6f424006d9393b760199c144777c529520b24db3"
|
||||
integrity sha512-FI9JuGk15LJ/ofTth3uPWUQHEaeVDCK3l7UfsmGuchH9E1YE8KRocmaCCpMNGgd5ABo3Ymg5BWAFe30eX53u7A==
|
||||
browsertrix-behaviors@^0.3.2:
|
||||
version "0.3.2"
|
||||
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.3.2.tgz#730c470916f1d383df7f2455a58f772f66cb31f4"
|
||||
integrity sha512-bWvw8yx+g9PPxvAcHUwXj93AHoKI62qIoLXhj/gjUNBw9a8ueXICG5HRGx0UasRcq9WCDvi094016nLdJiKAbQ==
|
||||
|
||||
bser@2.1.1:
|
||||
version "2.1.1"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue