diff --git a/Dockerfile b/Dockerfile index dc0226fd..f1466627 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,6 +33,7 @@ ADD uwsgi.ini /app/ ADD *.js /app/ RUN ln -s /app/main.js /usr/bin/crawl +RUN ln -s /opt/google/chrome/google-chrome /usr/bin/google-chrome WORKDIR /crawls diff --git a/crawler.js b/crawler.js index 048a327f..8a9d64a1 100644 --- a/crawler.js +++ b/crawler.js @@ -7,7 +7,8 @@ const path = require("path"); const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"]; const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"]; -const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"; + +const CHROME_PATH = "google-chrome"; // to ignore HTTPS error for HEAD check const HTTPS_AGENT = require("https").Agent({ @@ -20,7 +21,7 @@ const HTTP_AGENT = require("http").Agent(); // ============================================================================ class Crawler { constructor() { - this.headers = {"User-Agent": CHROME_USER_AGENT}; + this.headers = {}; this.seenList = new Set(); @@ -44,6 +45,15 @@ class Crawler { bootstrap() { const opts = {stdio: "ignore", cwd: this.params.cwd}; + let version = "84"; + + try { + version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim(); + } catch(e) {} + + this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`; + this.headers = {"User-Agent": this.userAgent}; + child_process.spawn("redis-server", {...opts, cwd: "/tmp/"}); child_process.spawnSync("wb-manager", ["init", this.params.collection], opts); @@ -245,7 +255,7 @@ class Crawler { // Puppeter Options return { headless: this.params.headless, - executablePath: "/opt/google/chrome/google-chrome", + executablePath: CHROME_PATH, ignoreHTTPSErrors: true, args: this.chromeArgs }; diff --git a/docker-compose.yml b/docker-compose.yml index 37bda52d..3ffacd8b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,7 +2,7 @@ version: '3.5' services: crawler: - image: webrecorder/browsertrix-crawler + image: webrecorder/browsertrix-crawler:0.1.0 build: context: ./