dockerfile: add symlink to 'google-chrome'

crawler: get version for user-agent via 'google-chrome --product-version'
compose: build versioned image, version 0.1.0
Ilya Kreymer 2020-11-03 17:16:29 +00:00
parent 5bf64be018
commit 7a13535d78
3 changed files with 15 additions and 4 deletions


@@ -33,6 +33,7 @@ ADD uwsgi.ini /app/
 ADD *.js /app/
 RUN ln -s /app/main.js /usr/bin/crawl
+RUN ln -s /opt/google/chrome/google-chrome /usr/bin/google-chrome
 WORKDIR /crawls
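
The new symlink puts a "google-chrome" command on the PATH inside the image, so the crawler can refer to the browser by name instead of hard-coding /opt/google/chrome/google-chrome. A minimal Node sketch (not part of the commit) of what that enables:

const { execFileSync } = require("child_process");

// the bare name resolves through PATH thanks to the /usr/bin/google-chrome symlink;
// without it, the full /opt/google/chrome/google-chrome path would be required
const version = execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
console.log(version); // e.g. "84.0.4147.89", the build previously hard-coded in the user-agent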


@@ -7,7 +7,8 @@ const path = require("path");
 const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
 const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
-const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";
+const CHROME_PATH = "google-chrome";
 
 // to ignore HTTPS error for HEAD check
 const HTTPS_AGENT = require("https").Agent({
@@ -20,7 +21,7 @@ const HTTP_AGENT = require("http").Agent();
 
 // ============================================================================
 class Crawler {
   constructor() {
-    this.headers = {"User-Agent": CHROME_USER_AGENT};
+    this.headers = {};
 
     this.seenList = new Set();
@@ -44,6 +45,15 @@ class Crawler {
   bootstrap() {
     const opts = {stdio: "ignore", cwd: this.params.cwd};
 
+    let version = "84";
+
+    try {
+      version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
+    } catch(e) {}
+
+    this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
+
+    this.headers = {"User-Agent": this.userAgent};
 
     child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
 
     child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);
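
Pulled out of the class, the detection logic added to bootstrap() amounts to the following self-contained sketch; the hard-coded "84" is only a fallback for when the probe fails:

const child_process = require("child_process");

function buildUserAgent() {
  // default major version, used only if google-chrome cannot be executed
  let version = "84";

  try {
    version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
  } catch (e) {
    // keep the fallback, e.g. when running outside the Docker image
  }

  return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
}

console.log(buildUserAgent());

The User-Agent header the crawler sends with its own requests now tracks whatever Chrome build is actually installed, rather than the previously hard-coded 84.0.4147.89.
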
@@ -245,7 +255,7 @@ class Crawler {
     // Puppeter Options
     return {
       headless: this.params.headless,
-      executablePath: "/opt/google/chrome/google-chrome",
+      executablePath: CHROME_PATH,
       ignoreHTTPSErrors: true,
       args: this.chromeArgs
     };
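
With CHROME_PATH set to the bare name "google-chrome", the launch options above also depend on the Dockerfile symlink at launch time. A hedged sketch of how such an options object is typically consumed, assuming puppeteer-core (swap in "puppeteer" if that is the package in use); the actual launch call is outside this hunk:

const puppeteer = require("puppeteer-core");

async function launchBrowser(crawler) {
  // executablePath is resolved through PATH, so the bare "google-chrome" is enough
  return puppeteer.launch({
    headless: crawler.params.headless,
    executablePath: "google-chrome",
    ignoreHTTPSErrors: true,
    args: crawler.chromeArgs
  });
}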


@@ -2,7 +2,7 @@ version: '3.5'
 services:
   crawler:
-    image: webrecorder/browsertrix-crawler
+    image: webrecorder/browsertrix-crawler:0.1.0
     build:
       context: ./