factor out behaviors to browsertrix-behaviors: (#32)

- inject built 'behaviors.js' from browsertrix-behaviors, init with options and run
- remove bgbehaviors
- move textextract to root for now
- add requirements.txt for python dependencies
- remove obsolete --scroll option; it is now part of the behaviors system

logging:
- configure logging options via --logging param, can include 'stats' (default), 'pywb', 'behaviors', and 'behaviors-debug'
- inject custom logging function for behaviors to call if either behaviors or behaviors-debug is set
- 'behaviors-debug' prints all debug messages from behaviors, while regular 'behaviors' prints main behavior messages (useful for verification)

dockerfile: add 'rebuild' arg to facilitate rebuilding the image from a specific step

bump to 0.3.0-beta.0
This commit is contained in:
Ilya Kreymer 2021-03-13 16:48:31 -08:00 committed by GitHub
parent 9ef3f25416
commit bc7f1badf3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 3541 additions and 4042 deletions

View file

@ -4,6 +4,8 @@ FROM oldwebtoday/chrome:${BROWSER_VERSION} as chrome
FROM nikolaik/python-nodejs:python3.8-nodejs14
RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add -
RUN apt-get update -y \
&& apt-get install --no-install-recommends -qqy fonts-stix locales-all redis-server xvfb \
&& apt-get clean \
@ -24,18 +26,21 @@ COPY --from=chrome /app/libpepflashplayer.so /app/libpepflashplayer.so
RUN dpkg -i /deb/*.deb; apt-get update; apt-get install -fqqy && \
rm -rf /var/lib/opts/lists/*
RUN pip install pywb>=2.5.0 uwsgi wacz
WORKDIR /app
ADD requirements.txt /app/
RUN pip install -r requirements.txt
ADD package.json /app/
# to allow forcing rebuilds from this stage
ARG REBUILD
RUN yarn install
ADD config.yaml /app/
ADD uwsgi.ini /app/
ADD *.js /app/
ADD behaviors/ /app/behaviors/
RUN ln -s /app/main.js /usr/bin/crawl

View file

@ -1,44 +0,0 @@
const AutoPlayBehavior = require("./global/autoplay");
const AutoFetchBehavior = require("./global/autofetcher");
const AutoScrollBehavior = require("./global/autoscroll");
// ===========================================================================
// Runs the enabled background behaviors (auto-fetch, auto-play, auto-scroll)
// for each crawled page.
class BackgroundBehaviors
{
  /**
   * Record which behaviors are enabled.
   * @param {string[]} bgbehaviors - behavior names, e.g. ["auto-fetch", "auto-play"]
   */
  constructor(bgbehaviors) {
    const enabled = new Set(bgbehaviors);
    this.doAutoFetch = enabled.has("auto-fetch");
    this.doAutoPlay = enabled.has("auto-play");
    this.doAutoScroll = enabled.has("auto-scroll");
  }

  /**
   * Instantiate the enabled behaviors and run their beforeLoad hooks.
   * Setup errors are logged and do not abort the crawl.
   * @returns {Function} thunk that runs all afterLoad hooks
   */
  async setup(page, crawler) {
    const active = [];

    try {
      if (this.doAutoFetch) {
        active.push(new AutoFetchBehavior());
      }
      if (this.doAutoPlay) {
        active.push(new AutoPlayBehavior());
      }
      if (this.doAutoScroll) {
        active.push(new AutoScrollBehavior());
      }

      await Promise.all(active.map((behavior) => behavior.beforeLoad(page, crawler)));
    } catch (err) {
      // best-effort: a failed behavior setup should not stop the page crawl
      console.log(err);
    }

    return () => Promise.all(active.map((behavior) => behavior.afterLoad(page, crawler)));
  }
}

module.exports = BackgroundBehaviors;

View file

@ -1,233 +0,0 @@
// AutoFetcher script
// extract and fetch all urls from srcsets, from images as well as audio/video
// also extract any urls from media query stylesheets that have not necessarily been loaded
// (May not work for cross-origin stylesheets)
function autofetcher() {
  // Elements whose src/srcset (or lazy-load data-src/data-srcset) attributes
  // may reference resources the browser has not fetched yet.
  const SRC_SET_SELECTOR = 'img[srcset], img[data-srcset], img[data-src], ' +
    'video[srcset], video[data-srcset], video[data-src], audio[srcset], audio[data-srcset], audio[data-src], ' +
    'picture > source[srcset], picture > source[data-srcset], picture > source[data-src], ' +
    'video > source[srcset], video > source[data-srcset], video > source[data-src], ' +
    'audio > source[srcset], audio > source[data-srcset], audio > source[data-src]';

  // splits a srcset attribute value into its individual candidate entries
  const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/; // eslint-disable-line no-useless-escape

  // match url(...) references and @import statements in CSS text
  const STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
  const IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;

  // ===========================================================================
  // Discovers resource URLs in the live DOM (srcset/data-src attributes and
  // media-query stylesheet rules) and fetches them so they are captured,
  // even if the browser itself would never load them.
  class AutoFetcher
  {
    constructor() {
      this.urlSet = new Set();  // URLs already queued, for dedup
      this.urlqueue = [];       // URLs waiting to be fetched
      this.numPending = 0;      // fetches currently in flight
    }

    // Run a first extraction pass once the page has loaded, then keep
    // watching for DOM changes.
    init() {
      console.log("init autofetch");
      window.addEventListener("load", () => {
        this.run();
        this.initObserver();
      });
    }

    // Extract once per crawler phase: loops while the crawler exposes a
    // __crawler_nextPhase hook, otherwise performs a single pass.
    async run() {
      /*eslint no-constant-condition: ["error", { "checkLoops": false }]*/
      while (true) {
        this.extractSrcSrcSetAll(document);
        this.extractStyleSheets();
        if (window.__crawler_nextPhase) {
          await window.__crawler_nextPhase();
        } else {
          break;
        }
      }
    }

    isValidUrl(url) {
      return url && (url.startsWith("http:") || url.startsWith("https:"));
    }

    // Resolve a URL against the document base, validate and dedupe it,
    // then schedule it for fetching.
    queueUrl(url) {
      try {
        url = new URL(url, document.baseURI).href;
      } catch (e) {
        return;
      }

      if (!this.isValidUrl(url)) {
        return;
      }

      if (this.urlSet.has(url)) {
        return;
      }

      this.urlSet.add(url);

      this.doFetch(url);
    }

    // Fetch queued URLs serially; the numPending check limits how many
    // concurrent drain loops can be running at once.
    async doFetch(url) {
      this.urlqueue.push(url);
      if (this.numPending <= 6) {
        while (this.urlqueue.length > 0) {
          // distinct name: avoid shadowing the 'url' parameter
          const nextUrl = this.urlqueue.shift();
          try {
            this.numPending++;
            console.log("AutoFetching: " + nextUrl);
            const resp = await fetch(nextUrl);
            // consume the body so the request fully completes
            await resp.blob();
          } catch (e) {
            console.log(e);
          }
          this.numPending--;
        }
      }
    }

    // Watch the whole document for added nodes and srcset attribute changes.
    initObserver() {
      this.mutobz = new MutationObserver((changes) => this.observeChange(changes));

      this.mutobz.observe(document.documentElement, {
        characterData: false,
        characterDataOldValue: false,
        attributes: true,
        attributeOldValue: true,
        subtree: true,
        childList: true,
        attributeFilter: ['srcset']
      });
    }

    // Dispatch on node type: changed srcset attribute, inline <style> text,
    // or a (possibly newly added) element subtree.
    processChangedNode(target) {
      switch (target.nodeType) {
        case Node.ATTRIBUTE_NODE:
          if (target.nodeName === "srcset") {
            this.extractSrcSetAttr(target.nodeValue);
          }
          break;

        case Node.TEXT_NODE:
          if (target.parentNode && target.parentNode.tagName === "STYLE") {
            this.extractStyleText(target.nodeValue);
          }
          break;

        case Node.ELEMENT_NODE:
          if (target.sheet) {
            this.extractStyleSheet(target.sheet);
          }
          this.extractSrcSrcSet(target);
          // re-scan descendants after a delay, in case they are populated lazily
          setTimeout(() => this.extractSrcSrcSetAll(target), 1000);
          break;
      }
    }

    observeChange(changes) {
      for (const change of changes) {
        this.processChangedNode(change.target);

        if (change.type === "childList") {
          for (const node of change.addedNodes) {
            this.processChangedNode(node);
          }
        }
      }
    }

    // Extract from every element matching SRC_SET_SELECTOR under root.
    extractSrcSrcSetAll(root) {
      const elems = root.querySelectorAll(SRC_SET_SELECTOR);

      for (const elem of elems) {
        //console.log(elem);
        this.extractSrcSrcSet(elem);
      }
    }

    // Queue an element's src/data-src URL and parse its srcset/data-srcset.
    extractSrcSrcSet(elem) {
      if (!elem || elem.nodeType !== Node.ELEMENT_NODE) {
        console.warn("No elem to extract from");
        return;
      }

      const src = elem.src || elem.getAttribute("data-src");

      if (src) {
        this.queueUrl(src);
      }

      const srcset = elem.srcset || elem.getAttribute("data-srcset");

      if (srcset) {
        this.extractSrcSetAttr(srcset);
      }
    }

    // Queue the URL portion of each srcset candidate ("url 2x" -> "url").
    extractSrcSetAttr(srcset) {
      for (const v of srcset.split(SRCSET_REGEX)) {
        if (v) {
          const parts = v.trim().split(" ");
          this.queueUrl(parts[0]);
        }
      }
    }

    extractStyleSheets(root) {
      root = root || document;

      for (const sheet of root.styleSheets) {
        this.extractStyleSheet(sheet);
      }
    }

    // Extract URLs from @media rules; cross-origin sheets may be unreadable
    // and are skipped with a log message.
    extractStyleSheet(sheet) {
      let rules;

      try {
        rules = sheet.cssRules || sheet.rules;
      } catch (e) {
        console.log("Can't access stylesheet");
        return;
      }

      for (const rule of rules) {
        if (rule.type === CSSRule.MEDIA_RULE) {
          this.extractStyleText(rule.cssText);
        }
      }
    }

    // Queue every url(...) and @import target found in the CSS text.
    extractStyleText(text) {
      const urlExtractor = (m, n1, n2, n3) => {
        this.queueUrl(n2);
        return n1 + n2 + n3;
      };

      text.replace(STYLE_REGEX, urlExtractor).replace(IMPORT_REGEX, urlExtractor);
    }
  }

  new AutoFetcher().init();
}
// ===========================================================================
// Wraps the in-page autofetcher() script for use by the crawler.
class AutoFetchBehavior
{
  // Serialize autofetcher() as an IIFE and inject it into every new document.
  async beforeLoad(page) {
    const injected = `(${autofetcher.toString()})();`;
    await page.evaluateOnNewDocument(injected);
  }

  // All work happens in-page via the injected script; nothing to do after load.
  async afterLoad() {
  }
}

module.exports = AutoFetchBehavior;

View file

@ -1,139 +0,0 @@
// ===========================================================================
// Injected into each page: rewrites known media-embed URLs (SoundCloud,
// Vimeo, YouTube) into their autoplay form, and periodically reports or
// plays <video>/<audio> elements found on the page.
function autoplay() {
  function run() {
    // only attempt the autoplay URL rewrite once per document
    if (self.navigator.__crawler_autoplay) {
      return;
    }

    function loadAutoplay(url) {
      if (self.__crawler_autoplayLoad) {
        self.__crawler_autoplayLoad(url);
      }

      // delay to allow splash image to load
      setTimeout(() => self.location.href = url, 1000);
    }

    //console.log("checking autoplay for " + document.location.href);

    self.navigator.__crawler_autoplay = true;

    // Per-site rules: 'rx' matches the embed host, 'check' tells whether the
    // URL already requests autoplay, 'handle' rewrites and reloads it.
    const specialActions = [
      {
        rx: /w\.soundcloud\.com/,
        check(url) {
          const autoplay = url.searchParams.get('auto_play');
          return autoplay === 'true';
        },
        handle(url) {
          url.searchParams.set('auto_play', 'true');
          // set continuous_play to true in order to handle
          // a playlist etc
          url.searchParams.set('continuous_play', 'true');
          loadAutoplay(url.href);
        },
      },
      {
        rx: [/player\.vimeo\.com/, /youtube(?:-nocookie)?\.com\/embed\//],
        check(url) {
          const autoplay = url.searchParams.get('autoplay');
          return autoplay === '1';
        },
        handle(url) {
          url.searchParams.set('autoplay', '1');
          loadAutoplay(url.href);
        },
      },
    ];

    const url = new URL(self.location.href);

    for (const action of specialActions) {
      // normalize: 'rx' may be a single regex or a list of regexes
      const patterns = Array.isArray(action.rx) ? action.rx : [action.rx];

      for (const rx of patterns) {
        if (url.href.search(rx) >= 0) {
          // already in autoplay form: nothing to do
          if (action.check(url)) {
            return;
          }
          return action.handle(url);
        }
      }
    }
  }

  self.document.addEventListener("readystatechange", run);

  if (self.document.readyState === "complete") {
    run();
  }

  // Periodically report newly seen media source URLs back to the crawler,
  // and call play() on media elements that have no src attribute set.
  const mediaSet = new Set();

  setInterval(() => {
    for (const media of self.document.querySelectorAll("video, audio")) {
      try {
        if (media.src && !mediaSet.has(media.src)) {
          if (self.__crawler_queueUrls && (media.src.startsWith("http:") || media.src.startsWith("https:"))) {
            self.__crawler_queueUrls(media.src);
          }
          mediaSet.add(media.src);
        } else if (!media.src) {
          media.play();
        }
      } catch(e) {
        console.log(e);
      }
    }
  }, 3000);
}
// ===========================================================================
// Puppeteer-side driver for the in-page autoplay() script: captures media
// URLs reported by the page and waits for video loading when needed.
class AutoPlayBehavior
{
  constructor() {
    this.mediaPromises = [];   // pending direct captures of reported media URLs
    this.waitForVideo = false; // set when an autoplay URL is being loaded
  }

  /**
   * Expose the callbacks used by the injected autoplay() script, then
   * inject the script to run in every new document. Errors are logged
   * and do not abort the crawl.
   * @param {object} page - puppeteer page
   * @param {object} crawler - crawler providing directFetchCapture()
   */
  async beforeLoad(page, crawler) {
    try {
      await page.exposeFunction("__crawler_queueUrls", async (url) => {
        this.mediaPromises.push(crawler.directFetchCapture(url));
      });

      await page.exposeFunction("__crawler_autoplayLoad", (url) => {
        console.log("*** Loading autoplay URL: " + url);
        this.waitForVideo = true;
      });

      const iife = `(${autoplay.toString()})();`;
      await page.evaluateOnNewDocument(iife);
    } catch(err) {
      console.log(err);
    }
  }

  /**
   * Wait for all queued media captures to finish, plus an extra 15s for
   * video loading if an autoplay URL was navigated to.
   */
  async afterLoad(page, crawler) {
    try {
      await Promise.all(this.mediaPromises);
    } catch (e) {
      console.log("Error loading media URLs", e);
    }

    if (this.waitForVideo) {
      console.log("Extra wait 15s for video loading");
      await crawler.sleep(15000);
    }
  }
}

module.exports = AutoPlayBehavior;

View file

@ -1,37 +0,0 @@
// Scroll the page downward in 250px steps every 500ms until the viewport
// bottom reaches the bottom of the document. Runs in the page context.
async function autoScroll() {
  // total scrollable height: the largest of the body/documentElement metrics
  const pageHeight = () =>
    Math.max(
      self.document.body.scrollHeight,
      self.document.body.offsetHeight,
      self.document.documentElement.clientHeight,
      self.document.documentElement.scrollHeight,
      self.document.documentElement.offsetHeight
    );

  while (self.scrollY + self.innerHeight < pageHeight()) {
    self.scrollBy({ top: 250, left: 0, behavior: "auto" });
    await new Promise((resolve) => setTimeout(resolve, 500));
  }
}
// ===========================================================================
// Runs the autoScroll() script in the page after load, bounded by a timeout.
class AutoScrollBehavior
{
  // Nothing to inject before navigation; scrolling happens after load.
  async beforeLoad() {
  }

  /**
   * Scroll the page to the bottom, giving up after 30 seconds.
   * Failures are logged and do not abort the crawl.
   */
  async afterLoad(page, crawler) {
    try {
      const deadline = crawler.sleep(30000);
      await Promise.race([page.evaluate(autoScroll), deadline]);
    } catch (e) {
      console.warn("Autoscroll Behavior Failed", e);
    }
  }
}

module.exports = AutoScrollBehavior;

View file

@ -7,13 +7,15 @@ const path = require("path");
const fs = require("fs");
const Sitemapper = require("sitemapper");
const { v4: uuidv4 } = require("uuid");
const TextExtract = require("./behaviors/global/textextract");
const BackgroundBehaviors = require("./behaviors/bgbehaviors");
const TextExtract = require("./textextract");
const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
const BEHAVIOR_LOG_FUNC = "__bx_log";
const CHROME_PATH = "google-chrome";
// to ignore HTTPS error for HEAD check
@ -39,10 +41,8 @@ class Crawler {
// was the limit hit?
this.limitHit = false;
this.monitor = true;
this.userAgent = "";
this.headers = {};
this.behaviorsLogDebug = false;
const params = require("yargs")
.usage("browsertrix-crawler [options]")
@ -64,9 +64,6 @@ class Crawler {
// pages file
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
// background behaviors
this.bgbehaviors = new BackgroundBehaviors(this.params.bgbehaviors || []);
}
configureUA() {
@ -108,7 +105,7 @@ class Crawler {
bootstrap() {
let opts = {}
if (this.params.pywb_log) {
if (this.params.logging.includes("pywb")) {
opts = {stdio: "inherit", cwd: this.params.cwd};
}
else{
@ -189,12 +186,6 @@ class Crawler {
describe: "Regex of page URLs that should be excluded from the crawl."
},
"scroll": {
describe: "If set, will autoscroll to bottom of the page",
type: "boolean",
default: false,
},
"collection": {
alias: "c",
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
@ -228,12 +219,12 @@ class Crawler {
default: false,
},
"pywb-log": {
describe: "If set, generate pywb log file",
type: "boolean",
default: false,
"logging": {
describe: "Logging options for crawler, can include: stats, pywb, behaviors",
type: "string",
default: "stats",
},
"text": {
describe: "If set, extract text to the pages.jsonl file",
type: "boolean",
@ -269,9 +260,9 @@ class Crawler {
describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
},
"bgbehaviors": {
"behaviors": {
describe: "Which background behaviors to enable on each page",
default: "auto-play,auto-fetch",
default: "autoplay,autofetch,siteSpecific",
type: "string",
},
};
@ -313,8 +304,19 @@ class Crawler {
}
}
// log options
argv.logging = argv.logging.split(",");
// background behaviors to apply
argv.bgbehaviors = argv.bgbehaviors.split(",");
const behaviorOpts = {};
argv.behaviors.split(",").forEach((x) => behaviorOpts[x] = true);
if (argv.logging.includes("behaviors")) {
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
} else if (argv.logging.includes("behaviors-debug")) {
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
this.behaviorsLogDebug = true;
}
this.behaviorOpts = JSON.stringify(behaviorOpts);
if (!argv.newContext) {
argv.newContext = "page";
@ -421,15 +423,33 @@ class Crawler {
await page.emulate(this.emulateDevice);
}
const bgbehavior = await this.bgbehaviors.setup(page, this);
if (this.behaviorOpts) {
await page.exposeFunction(BEHAVIOR_LOG_FUNC, ({data, type}) => {
switch (type) {
case "info":
console.log(JSON.stringify(data));
break;
case "debug":
default:
if (this.behaviorsLogDebug) {
console.log("behavior debug: " + JSON.stringify(data));
}
}
});
await page.evaluateOnNewDocument(behaviors + `
self.__bx_behaviors.init(${this.behaviorOpts});
`);
}
// run custom driver here
await this.driver({page, data, crawler: this});
const title = await page.title();
var text = ''
if (this.params.text){
let text = '';
if (this.params.text) {
const client = await page.target().createCDPSession();
const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
text = await new TextExtract(result).parseTextFromDom();
@ -437,8 +457,8 @@ class Crawler {
this.writePage(data.url, title, this.params.text, text);
if (bgbehavior) {
await bgbehavior();
if (this.behaviorOpts) {
await Promise.allSettled(page.frames().map(frame => frame.evaluate("self.__bx_behaviors.run();")));
}
this.writeStats();
@ -464,7 +484,7 @@ class Crawler {
timeout: this.params.timeout * 2,
puppeteerOptions: this.puppeteerArgs,
puppeteer,
monitor: this.monitor
monitor: this.params.logging.includes("stats")
});
this.cluster.task((opts) => this.crawlPage(opts));
@ -669,8 +689,8 @@ class Crawler {
});
if (resp.status >= 400) {
console.log(`Skipping ${url}, invalid status ${resp.status}`);
return false;
console.log(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
return true;
}
const contentType = resp.headers.get("Content-Type");

View file

@ -1,5 +1,3 @@
//const autoplayScript = require("/app/autoplay.js");
/* eslint-disable no-undef */
module.exports = async ({data, page, crawler}) => {

View file

@ -2,7 +2,7 @@ version: '3.5'
services:
crawler:
image: webrecorder/browsertrix-crawler:0.2.1-beta.0
image: webrecorder/browsertrix-crawler:0.3.0-beta.0
build:
context: ./

View file

@ -1,12 +1,13 @@
{
"name": "browsertrix-crawler",
"version": "0.2.1-beta.0",
"version": "0.3.0-beta.0",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
"license": "MIT",
"dependencies": {
"abort-controller": "^3.0.0",
"browsertrix-behaviors": "github:webrecorder/browsertrix-behaviors",
"node-fetch": "^2.6.1",
"puppeteer-cluster": "^0.22.0",
"puppeteer-core": "^5.3.1",

3
requirements.txt Normal file
View file

@ -0,0 +1,3 @@
pywb>=2.5.0
uwsgi
wacz>=0.2.1

7027
yarn.lock

File diff suppressed because it is too large Load diff