Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
factor out behaviors to browsertrix-behaviors: (#32)
- inject built 'behaviors.js' from browsertrix-behaviors, init with options and run
- remove bgbehaviors
- move textextract to root for now
- add requirements.txt for python dependencies
- remove obsolete --scroll option, now part of the behaviors system

logging:
- configure logging options via --logging param, can include 'stats' (default), 'pywb', 'behaviors', and 'behaviors-debug'
- inject custom logging function for behaviors to call if either 'behaviors' or 'behaviors-debug' is set
- 'behaviors-debug' prints all debug messages from behaviors, while regular 'behaviors' prints only the main behavior messages (useful for verification)

dockerfile: add 'rebuild' arg to facilitate rebuilding the image from a specific step

bump to 0.3.0-beta.0
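As a usage sketch (not part of this commit), a crawl using the new options might be launched like so; the seed URL and collection name are placeholders, and `crawl` is the /usr/bin/crawl entrypoint symlinked in the Dockerfile below:

    crawl --url https://example.com/ --collection example \
        --behaviors autoplay,autofetch,siteSpecific \
        --logging stats,behaviors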
parent 9ef3f25416
commit bc7f1badf3
12 changed files with 3541 additions and 4042 deletions
11  Dockerfile
@@ -4,6 +4,8 @@ FROM oldwebtoday/chrome:${BROWSER_VERSION} as chrome

FROM nikolaik/python-nodejs:python3.8-nodejs14

RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add -

RUN apt-get update -y \
    && apt-get install --no-install-recommends -qqy fonts-stix locales-all redis-server xvfb \
    && apt-get clean \

@@ -24,18 +26,21 @@ COPY --from=chrome /app/libpepflashplayer.so /app/libpepflashplayer.so
RUN dpkg -i /deb/*.deb; apt-get update; apt-get install -fqqy && \
    rm -rf /var/lib/opts/lists/*

RUN pip install pywb>=2.5.0 uwsgi wacz

WORKDIR /app

ADD requirements.txt /app/
RUN pip install -r requirements.txt

ADD package.json /app/

# to allow forcing rebuilds from this stage
ARG REBUILD

RUN yarn install

ADD config.yaml /app/
ADD uwsgi.ini /app/
ADD *.js /app/
ADD behaviors/ /app/behaviors/

RUN ln -s /app/main.js /usr/bin/crawl
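To illustrate the new REBUILD arg (a sketch, not part of this commit): Docker treats the RUN steps that follow an ARG as depending on its value, so supplying a fresh value (a timestamp, for example) should invalidate the layer cache from RUN yarn install onward while leaving the earlier apt and pip layers untouched:

    docker build --build-arg REBUILD=$(date +%s) -t webrecorder/browsertrix-crawler:0.3.0-beta.0 .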
behaviors/bgbehaviors.js
@@ -1,44 +0,0 @@
const AutoPlayBehavior = require("./global/autoplay");
const AutoFetchBehavior = require("./global/autofetcher");
const AutoScrollBehavior = require("./global/autoscroll");


// ===========================================================================
class BackgroundBehaviors
{
  constructor(bgbehaviors) {
    this.doAutoFetch = bgbehaviors.includes("auto-fetch");
    this.doAutoPlay = bgbehaviors.includes("auto-play");
    this.doAutoScroll = bgbehaviors.includes("auto-scroll");
  }

  async setup(page, crawler) {
    const behaviors = [];

    try {
      if (this.doAutoFetch) {
        behaviors.push(new AutoFetchBehavior());
      }

      if (this.doAutoPlay) {
        behaviors.push(new AutoPlayBehavior());
      }

      if (this.doAutoScroll) {
        behaviors.push(new AutoScrollBehavior());
      }

      await Promise.all(behaviors.map(b => b.beforeLoad(page, crawler)));

    } catch (err) {
      console.log(err);
    }

    return () => Promise.all(behaviors.map(b => b.afterLoad(page, crawler)));
  }
}

module.exports = BackgroundBehaviors;
behaviors/global/autofetcher.js
@@ -1,233 +0,0 @@
// AutoFetcher script
// extract and fetch all urls from srcsets, from images as well as audio/video
// also extract any urls from media query stylesheets that have not necessarily been loaded
// (May not work for cross-origin stylesheets)

function autofetcher() {

  const SRC_SET_SELECTOR = 'img[srcset], img[data-srcset], img[data-src], ' +
    'video[srcset], video[data-srcset], video[data-src], audio[srcset], audio[data-srcset], audio[data-src], ' +
    'picture > source[srcset], picture > source[data-srcset], picture > source[data-src], ' +
    'video > source[srcset], video > source[data-srcset], video > source[data-src], ' +
    'audio > source[srcset], audio > source[data-srcset], audio > source[data-src]';

  const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/; // eslint-disable-line no-useless-escape

  const STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
  const IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;


  class AutoFetcher
  {
    constructor() {
      this.urlSet = new Set();
      this.urlqueue = [];
      this.numPending = 0;
    }

    init() {
      console.log("init autofetch");

      window.addEventListener("load", () => {
        this.run();
        this.initObserver();
      });
    }

    async run() {
      /*eslint no-constant-condition: ["error", { "checkLoops": false }]*/
      while (true) {
        this.extractSrcSrcSetAll(document);
        this.extractStyleSheets();
        if (window.__crawler_nextPhase) {
          await window.__crawler_nextPhase();
        } else {
          break;
        }
      }
    }

    isValidUrl(url) {
      return url && (url.startsWith("http:") || url.startsWith("https:"));
    }

    queueUrl(url) {
      try {
        url = new URL(url, document.baseURI).href;
      } catch (e) {
        return;
      }

      if (!this.isValidUrl(url)) {
        return;
      }

      if (this.urlSet.has(url)) {
        return;
      }

      this.urlSet.add(url);

      this.doFetch(url);
    }

    async doFetch(url) {
      this.urlqueue.push(url);
      if (this.numPending <= 6) {
        while (this.urlqueue.length > 0) {
          const url = this.urlqueue.shift();
          try {
            this.numPending++;
            console.log("AutoFetching: " + url);
            const resp = await fetch(url);
            await resp.blob();
          } catch (e) {
            console.log(e)
          }
          this.numPending--;
        }
      }
    }

    initObserver() {
      this.mutobz = new MutationObserver((changes) => this.observeChange(changes));

      this.mutobz.observe(document.documentElement, {
        characterData: false,
        characterDataOldValue: false,
        attributes: true,
        attributeOldValue: true,
        subtree: true,
        childList: true,
        attributeFilter: ['srcset']
      });
    }

    processChangedNode(target) {
      switch (target.nodeType) {
        case Node.ATTRIBUTE_NODE:
          if (target.nodeName === "srcset") {
            this.extractSrcSetAttr(target.nodeValue);
          }
          break;

        case Node.TEXT_NODE:
          if (target.parentNode && target.parentNode.tagName === "STYLE") {
            this.extractStyleText(target.nodeValue);
          }
          break;

        case Node.ELEMENT_NODE:
          if (target.sheet) {
            this.extractStyleSheet(target.sheet);
          }
          this.extractSrcSrcSet(target);
          setTimeout(() => this.extractSrcSrcSetAll(target), 1000);
          break;
      }
    }

    observeChange(changes) {
      for (const change of changes) {
        this.processChangedNode(change.target);

        if (change.type === "childList") {
          for (const node of change.addedNodes) {
            this.processChangedNode(node);
          }
        }
      }
    }

    extractSrcSrcSetAll(root) {
      const elems = root.querySelectorAll(SRC_SET_SELECTOR);

      for (const elem of elems) {
        //console.log(elem);
        this.extractSrcSrcSet(elem);
      }
    }

    extractSrcSrcSet(elem) {
      if (!elem || elem.nodeType !== Node.ELEMENT_NODE) {
        console.warn("No elem to extract from");
        return;
      }

      const src = elem.src || elem.getAttribute("data-src");

      if (src) {
        this.queueUrl(src);
      }

      const srcset = elem.srcset || elem.getAttribute("data-srcset");

      if (srcset) {
        this.extractSrcSetAttr(srcset);
      }
    }

    extractSrcSetAttr(srcset) {
      for (const v of srcset.split(SRCSET_REGEX)) {
        if (v) {
          const parts = v.trim().split(" ");
          this.queueUrl(parts[0]);
        }
      }
    }

    extractStyleSheets(root) {
      root = root || document;

      for (const sheet of root.styleSheets) {
        this.extractStyleSheet(sheet);
      }
    }

    extractStyleSheet(sheet) {
      let rules;

      try {
        rules = sheet.cssRules || sheet.rules;
      } catch (e) {
        console.log("Can't access stylesheet");
        return;
      }

      for (const rule of rules) {
        if (rule.type === CSSRule.MEDIA_RULE) {
          this.extractStyleText(rule.cssText);
        }
      }
    }

    extractStyleText(text) {
      const urlExtractor = (m, n1, n2, n3) => {
        this.queueUrl(n2);
        return n1 + n2 + n3;
      };

      text.replace(STYLE_REGEX, urlExtractor).replace(IMPORT_REGEX, urlExtractor);
    }
  }

  new AutoFetcher().init();
}


// ===========================================================================
class AutoFetchBehavior
{
  async beforeLoad(page) {
    const iife = `(${autofetcher.toString()})();`;
    await page.evaluateOnNewDocument(iife);
  }

  async afterLoad() {

  }
}


module.exports = AutoFetchBehavior;
behaviors/global/autoplay.js
@@ -1,139 +0,0 @@
// ===========================================================================
function autoplay() {
  function run() {
    if (self.navigator.__crawler_autoplay) {
      return;
    }

    function loadAutoplay(url) {
      if (self.__crawler_autoplayLoad) {
        self.__crawler_autoplayLoad(url);
      }
      // delay to allow splash image to load
      setTimeout(() => self.location.href = url, 1000);
    }

    //console.log("checking autoplay for " + document.location.href);
    self.navigator.__crawler_autoplay = true;

    const specialActions = [
      {
        rx: /w\.soundcloud\.com/,
        check(url) {
          const autoplay = url.searchParams.get('auto_play');
          return autoplay === 'true';
        },
        handle(url) {
          url.searchParams.set('auto_play', 'true');
          // set continuous_play to true in order to handle
          // a playlist etc
          url.searchParams.set('continuous_play', 'true');
          loadAutoplay(url.href);
        },
      },
      {
        rx: [/player\.vimeo\.com/, /youtube(?:-nocookie)?\.com\/embed\//],
        check(url) {
          const autoplay = url.searchParams.get('autoplay');
          return autoplay === '1';
        },
        handle(url) {
          url.searchParams.set('autoplay', '1');
          loadAutoplay(url.href);
        },
      },
    ];
    const url = new URL(self.location.href);
    for (let i = 0; i < specialActions.length; i++) {
      if (Array.isArray(specialActions[i].rx)) {
        const rxs = specialActions[i].rx;
        for (let j = 0; j < rxs.length; j++) {
          if (url.href.search(rxs[j]) >= 0) {
            if (specialActions[i].check(url)) return;
            return specialActions[i].handle(url);
          }
        }
      } else if (url.href.search(specialActions[i].rx) >= 0) {
        if (specialActions[i].check(url)) return;
        return specialActions[i].handle(url);
      }
    }
  }

  self.document.addEventListener("readystatechange", run);

  if (self.document.readyState === "complete") {
    run();
  }


  const mediaSet = new Set();

  setInterval(() => {
    const medias = self.document.querySelectorAll("video, audio");

    for (const media of medias) {
      try {
        if (media.src && !mediaSet.has(media.src)) {
          if (self.__crawler_queueUrls && (media.src.startsWith("http:") || media.src.startsWith("https:"))) {
            self.__crawler_queueUrls(media.src);
          }
          mediaSet.add(media.src);
        } else if (!media.src) {
          media.play();
        }
      } catch(e) {
        console.log(e);
      }
    }
  }, 3000);

}


// ===========================================================================
class AutoPlayBehavior
{
  constructor() {
    this.mediaPromises = [];
    this.waitForVideo = false;
  }

  async beforeLoad(page, crawler) {
    try {
      await page.exposeFunction("__crawler_queueUrls", async (url) => {
        this.mediaPromises.push(crawler.directFetchCapture(url));
      });

      await page.exposeFunction("__crawler_autoplayLoad", (url) => {
        console.log("*** Loading autoplay URL: " + url);
        this.waitForVideo = true;
      });

      const iife = `(${autoplay.toString()})();`;
      await page.evaluateOnNewDocument(iife);

    } catch(err) {
      console.log(err);
    }
  }

  async afterLoad(page, crawler) {
    try {
      await Promise.all(this.mediaPromises);
    } catch (e) {
      console.log("Error loading media URLs", e);
    }

    if (this.waitForVideo) {
      console.log("Extra wait 15s for video loading");
      await crawler.sleep(15000);
    }
  }
}


module.exports = AutoPlayBehavior
behaviors/global/autoscroll.js
@@ -1,37 +0,0 @@
async function autoScroll() {
  const canScrollMore = () =>
    self.scrollY + self.innerHeight <
    Math.max(
      self.document.body.scrollHeight,
      self.document.body.offsetHeight,
      self.document.documentElement.clientHeight,
      self.document.documentElement.scrollHeight,
      self.document.documentElement.offsetHeight
    );

  const scrollOpts = { top: 250, left: 0, behavior: "auto" };

  while (canScrollMore()) {
    self.scrollBy(scrollOpts);
    await new Promise(resolve => setTimeout(resolve, 500));
  }
}


// ===========================================================================
class AutoScrollBehavior
{

  async beforeLoad() {
  }

  async afterLoad(page, crawler) {
    try {
      await Promise.race([page.evaluate(autoScroll), crawler.sleep(30000)]);
    } catch (e) {
      console.warn("Autoscroll Behavior Failed", e);
    }
  }
}

module.exports = AutoScrollBehavior;
82  crawler.js
@@ -7,13 +7,15 @@ const path = require("path");
const fs = require("fs");
const Sitemapper = require("sitemapper");
const { v4: uuidv4 } = require("uuid");
const TextExtract = require("./behaviors/global/textextract");
const BackgroundBehaviors = require("./behaviors/bgbehaviors");

const TextExtract = require("./textextract");
const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");

const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];

const BEHAVIOR_LOG_FUNC = "__bx_log";

const CHROME_PATH = "google-chrome";

// to ignore HTTPS error for HEAD check

@@ -39,10 +41,8 @@ class Crawler {
    // was the limit hit?
    this.limitHit = false;

    this.monitor = true;

    this.userAgent = "";
    this.headers = {};
    this.behaviorsLogDebug = false;

    const params = require("yargs")
      .usage("browsertrix-crawler [options]")

@@ -64,9 +64,6 @@ class Crawler {

    // pages file
    this.pagesFile = path.join(this.pagesDir, "pages.jsonl");

    // background behaviors
    this.bgbehaviors = new BackgroundBehaviors(this.params.bgbehaviors || []);
  }

  configureUA() {

@@ -108,7 +105,7 @@ class Crawler {

  bootstrap() {
    let opts = {}
    if (this.params.pywb_log) {
    if (this.params.logging.includes("pywb")) {
      opts = {stdio: "inherit", cwd: this.params.cwd};
    }
    else{

@@ -189,12 +186,6 @@ class Crawler {
        describe: "Regex of page URLs that should be excluded from the crawl."
      },

      "scroll": {
        describe: "If set, will autoscroll to bottom of the page",
        type: "boolean",
        default: false,
      },

      "collection": {
        alias: "c",
        describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",

@@ -228,12 +219,12 @@ class Crawler {
        default: false,
      },

      "pywb-log": {
        describe: "If set, generate pywb log file",
        type: "boolean",
        default: false,
      "logging": {
        describe: "Logging options for crawler, can include: stats, pywb, behaviors",
        type: "string",
        default: "stats",
      },


      "text": {
        describe: "If set, extract text to the pages.jsonl file",
        type: "boolean",

@@ -269,9 +260,9 @@ class Crawler {
        describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
      },

      "bgbehaviors": {
      "behaviors": {
        describe: "Which background behaviors to enable on each page",
        default: "auto-play,auto-fetch",
        default: "autoplay,autofetch,siteSpecific",
        type: "string",
      },
    };

@@ -313,8 +304,19 @@ class Crawler {
      }
    }

    // log options
    argv.logging = argv.logging.split(",");

    // background behaviors to apply
    argv.bgbehaviors = argv.bgbehaviors.split(",");
    const behaviorOpts = {};
    argv.behaviors.split(",").forEach((x) => behaviorOpts[x] = true);
    if (argv.logging.includes("behaviors")) {
      behaviorOpts.log = BEHAVIOR_LOG_FUNC;
    } else if (argv.logging.includes("behaviors-debug")) {
      behaviorOpts.log = BEHAVIOR_LOG_FUNC;
      this.behaviorsLogDebug = true;
    }
    this.behaviorOpts = JSON.stringify(behaviorOpts);

    if (!argv.newContext) {
      argv.newContext = "page";

@@ -421,15 +423,33 @@ class Crawler {
      await page.emulate(this.emulateDevice);
    }

    const bgbehavior = await this.bgbehaviors.setup(page, this);
    if (this.behaviorOpts) {
      await page.exposeFunction(BEHAVIOR_LOG_FUNC, ({data, type}) => {
        switch (type) {
          case "info":
            console.log(JSON.stringify(data));
            break;

          case "debug":
          default:
            if (this.behaviorsLogDebug) {
              console.log("behavior debug: " + JSON.stringify(data));
            }
        }
      });

      await page.evaluateOnNewDocument(behaviors + `
self.__bx_behaviors.init(${this.behaviorOpts});
`);
    }

    // run custom driver here
    await this.driver({page, data, crawler: this});


    const title = await page.title();
    var text = ''
    if (this.params.text){
    let text = '';
    if (this.params.text) {
      const client = await page.target().createCDPSession();
      const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
      text = await new TextExtract(result).parseTextFromDom();

@@ -437,8 +457,8 @@ class Crawler {

    this.writePage(data.url, title, this.params.text, text);

    if (bgbehavior) {
      await bgbehavior();
    if (this.behaviorOpts) {
      await Promise.allSettled(page.frames().map(frame => frame.evaluate("self.__bx_behaviors.run();")));
    }

    this.writeStats();

@@ -464,7 +484,7 @@ class Crawler {
      timeout: this.params.timeout * 2,
      puppeteerOptions: this.puppeteerArgs,
      puppeteer,
      monitor: this.monitor
      monitor: this.params.logging.includes("stats")
    });

    this.cluster.task((opts) => this.crawlPage(opts));

@@ -669,8 +689,8 @@ class Crawler {
    });

    if (resp.status >= 400) {
      console.log(`Skipping ${url}, invalid status ${resp.status}`);
      return false;
      console.log(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
      return true;
    }

    const contentType = resp.headers.get("Content-Type");
@@ -1,5 +1,3 @@
//const autoplayScript = require("/app/autoplay.js");

/* eslint-disable no-undef */

module.exports = async ({data, page, crawler}) => {
docker-compose.yml
@@ -2,7 +2,7 @@ version: '3.5'

services:
  crawler:
    image: webrecorder/browsertrix-crawler:0.2.1-beta.0
    image: webrecorder/browsertrix-crawler:0.3.0-beta.0
    build:
      context: ./
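A minimal sketch of exercising the updated compose file (service name and image tag as above; the seed URL is a placeholder):

    docker-compose build crawler
    docker-compose run crawler crawl --url https://example.com/ --logging stats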
package.json
@@ -1,12 +1,13 @@
{
  "name": "browsertrix-crawler",
  "version": "0.2.1-beta.0",
  "version": "0.3.0-beta.0",
  "main": "browsertrix-crawler",
  "repository": "https://github.com/webrecorder/browsertrix-crawler",
  "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
  "license": "MIT",
  "dependencies": {
    "abort-controller": "^3.0.0",
    "browsertrix-behaviors": "github:webrecorder/browsertrix-behaviors",
    "node-fetch": "^2.6.1",
    "puppeteer-cluster": "^0.22.0",
    "puppeteer-core": "^5.3.1",
3  requirements.txt  Normal file
@@ -0,0 +1,3 @@
pywb>=2.5.0
uwsgi
wacz>=0.2.1