mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
dockerfile: add symlink to 'google-chrome'
crawler: get version for user-agent via 'google-chrome --product-version'
compose: build versioned image, version 0.1.0
This commit is contained in:
parent
5bf64be018
commit
7a13535d78
3 changed files with 15 additions and 4 deletions
|
@ -33,6 +33,7 @@ ADD uwsgi.ini /app/
|
||||||
ADD *.js /app/
|
ADD *.js /app/
|
||||||
|
|
||||||
RUN ln -s /app/main.js /usr/bin/crawl
|
RUN ln -s /app/main.js /usr/bin/crawl
|
||||||
|
RUN ln -s /opt/google/chrome/google-chrome /usr/bin/google-chrome
|
||||||
|
|
||||||
WORKDIR /crawls
|
WORKDIR /crawls
|
||||||
|
|
||||||
|
|
16
crawler.js
16
crawler.js
|
@ -7,7 +7,8 @@ const path = require("path");
|
||||||
|
|
||||||
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
|
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
|
||||||
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
|
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
|
||||||
const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";
|
|
||||||
|
const CHROME_PATH = "google-chrome";
|
||||||
|
|
||||||
// to ignore HTTPS error for HEAD check
|
// to ignore HTTPS error for HEAD check
|
||||||
const HTTPS_AGENT = require("https").Agent({
|
const HTTPS_AGENT = require("https").Agent({
|
||||||
|
@ -20,7 +21,7 @@ const HTTP_AGENT = require("http").Agent();
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
class Crawler {
|
class Crawler {
|
||||||
constructor() {
|
constructor() {
|
||||||
this.headers = {"User-Agent": CHROME_USER_AGENT};
|
this.headers = {};
|
||||||
|
|
||||||
this.seenList = new Set();
|
this.seenList = new Set();
|
||||||
|
|
||||||
|
@ -44,6 +45,15 @@ class Crawler {
|
||||||
bootstrap() {
|
bootstrap() {
|
||||||
const opts = {stdio: "ignore", cwd: this.params.cwd};
|
const opts = {stdio: "ignore", cwd: this.params.cwd};
|
||||||
|
|
||||||
|
let version = "84";
|
||||||
|
|
||||||
|
try {
|
||||||
|
version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
|
||||||
|
} catch(e) {}
|
||||||
|
|
||||||
|
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
|
||||||
|
this.headers = {"User-Agent": this.userAgent};
|
||||||
|
|
||||||
child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
|
child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
|
||||||
|
|
||||||
child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);
|
child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);
|
||||||
|
@ -245,7 +255,7 @@ class Crawler {
|
||||||
// Puppeteer Options
|
// Puppeteer Options
|
||||||
return {
|
return {
|
||||||
headless: this.params.headless,
|
headless: this.params.headless,
|
||||||
executablePath: "/opt/google/chrome/google-chrome",
|
executablePath: CHROME_PATH,
|
||||||
ignoreHTTPSErrors: true,
|
ignoreHTTPSErrors: true,
|
||||||
args: this.chromeArgs
|
args: this.chromeArgs
|
||||||
};
|
};
|
||||||
|
|
|
@ -2,7 +2,7 @@ version: '3.5'
|
||||||
|
|
||||||
services:
|
services:
|
||||||
crawler:
|
crawler:
|
||||||
image: webrecorder/browsertrix-crawler
|
image: webrecorder/browsertrix-crawler:0.1.0
|
||||||
build:
|
build:
|
||||||
context: ./
|
context: ./
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue