mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
dockerfile: add symlink to 'google-chrome'
crawler: get version for user-agent via 'google-chrome --product-version'
compose: build versioned image, version 0.1.0
This commit is contained in:
parent
5bf64be018
commit
7a13535d78
3 changed files with 15 additions and 4 deletions
|
@ -33,6 +33,7 @@ ADD uwsgi.ini /app/
|
|||
ADD *.js /app/
|
||||
|
||||
RUN ln -s /app/main.js /usr/bin/crawl
|
||||
RUN ln -s /opt/google/chrome/google-chrome /usr/bin/google-chrome
|
||||
|
||||
WORKDIR /crawls
|
||||
|
||||
|
|
16
crawler.js
16
crawler.js
|
@ -7,7 +7,8 @@ const path = require("path");
|
|||
|
||||
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
|
||||
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
|
||||
const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";
|
||||
|
||||
const CHROME_PATH = "google-chrome";
|
||||
|
||||
// to ignore HTTPS error for HEAD check
|
||||
const HTTPS_AGENT = require("https").Agent({
|
||||
|
@ -20,7 +21,7 @@ const HTTP_AGENT = require("http").Agent();
|
|||
// ============================================================================
|
||||
class Crawler {
|
||||
constructor() {
|
||||
this.headers = {"User-Agent": CHROME_USER_AGENT};
|
||||
this.headers = {};
|
||||
|
||||
this.seenList = new Set();
|
||||
|
||||
|
@ -44,6 +45,15 @@ class Crawler {
|
|||
bootstrap() {
|
||||
const opts = {stdio: "ignore", cwd: this.params.cwd};
|
||||
|
||||
let version = "84";
|
||||
|
||||
try {
|
||||
version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
|
||||
} catch(e) {}
|
||||
|
||||
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
|
||||
this.headers = {"User-Agent": this.userAgent};
|
||||
|
||||
child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
|
||||
|
||||
child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);
|
||||
|
@ -245,7 +255,7 @@ class Crawler {
|
|||
// Puppeteer Options
|
||||
return {
|
||||
headless: this.params.headless,
|
||||
executablePath: "/opt/google/chrome/google-chrome",
|
||||
executablePath: CHROME_PATH,
|
||||
ignoreHTTPSErrors: true,
|
||||
args: this.chromeArgs
|
||||
};
|
||||
|
|
|
@ -2,7 +2,7 @@ version: '3.5'
|
|||
|
||||
services:
|
||||
crawler:
|
||||
image: webrecorder/browsertrix-crawler
|
||||
image: webrecorder/browsertrix-crawler:0.1.0
|
||||
build:
|
||||
context: ./
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue