Add WACZ support to browsertrix (#6)

* Add WACZ creation support, fixes #2
* A new --generateWACZ flag also writes the crawl output as a WACZ file (currently named <collection>/<collection>.wacz)
* A page list is generated in <collection>/pages/pages.jsonl; an entry with the URL and title of each crawled page is appended to the end of the file (see the sample below)
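
For reference, pages.jsonl is a JSON-lines file: initPages() writes a single header line when the file is first created, and writePage() then appends one object per crawled page. A minimal sketch of the resulting contents, with a made-up UUID, URL, and title:

    {"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": false}
    {"id": "5a9a06a0-8efa-4638-93a5-a3fa07cfca27", "url": "https://example.com/", "title": "Example Domain"}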

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Authored by Emma Dickson on 2021-02-04 00:28:32 -05:00; committed by GitHub
parent 789279021b
commit 9c139eba2b
3 changed files with 78 additions and 14 deletions

Dockerfile

@@ -14,15 +14,13 @@ ENV PROXY_HOST=localhost \
     DISPLAY=:99 \
     GEOMETRY=1360x1020x16
 
-RUN pip install uwsgi
-RUN pip install pywb>=2.5.0
 
 COPY --from=chrome /tmp/*.deb /deb/
 COPY --from=chrome /app/libpepflashplayer.so /app/libpepflashplayer.so
 RUN dpkg -i /deb/*.deb; apt-get update; apt-get install -fqqy && \
     rm -rf /var/lib/opts/lists/*
 
+RUN pip install pywb>=2.5.0 uwsgi wacz
 
 WORKDIR /app
 ADD package.json /app/
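
With the wacz package now installed in the image, WACZ generation can be requested at crawl time. A hypothetical invocation; the webrecorder/browsertrix-crawler image name, the crawl entrypoint, and the --collection flag are assumptions here, not part of this diff:

    docker run -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --collection example --generateWACZ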

crawler.js

@@ -5,7 +5,8 @@ const fetch = require("node-fetch");
 const AbortController = require("abort-controller");
 const path = require("path");
 const fs = require("fs");
-const Sitemapper = require('sitemapper');
+const Sitemapper = require("sitemapper");
+const { v4: uuidv4 } = require("uuid");
 
 const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
 const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
@@ -50,6 +51,16 @@ class Crawler {
 
     this.params = params;
     this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;
+
+    // root collections dir
+    this.collDir = path.join(this.params.cwd, "collections", this.params.collection);
+
+    // pages directory
+    this.pagesDir = path.join(this.collDir, "pages");
+
+    // pages file
+    this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
   }
 
   configureUA() {
@@ -72,7 +83,9 @@ class Crawler {
 
     try {
       version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
-    } catch(e) {}
+    } catch(e) {
+      console.log(e);
+    }
 
     this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
   }
@@ -194,7 +207,13 @@ class Crawler {
         type: "boolean",
         default: false,
       },
 
+      "generateWACZ": {
+        describe: "If set, generate wacz",
+        type: "boolean",
+        default: false,
+      },
+
       "cwd": {
         describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
         type: "string",
@@ -378,7 +397,8 @@ class Crawler {
     this.cluster.task(async (opts) => {
       try {
         await this.driver({...opts, crawler: this});
-
+        const title = await opts.page.title();
+        this.writePage(opts.data.url, title);
         this.writeStats();
       } catch (e) {
@@ -386,6 +406,8 @@ class Crawler {
       }
     });
 
+    this.initPages();
+
     this.queueUrl(this.params.url);
 
     if (this.params.useSitemap) {
@@ -406,11 +428,30 @@ class Crawler {
       child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
     }
 
+    if (this.params.generateWACZ) {
+      console.log("Generating WACZ");
+      const archiveDir = path.join(this.collDir, "archive");
+
+      // Get a list of the warcs inside the archive directory
+      const warcFileList = fs.readdirSync(archiveDir);
+
+      // Build the argument list to pass to the wacz create command
+      const waczFilename = this.params.collection.concat(".wacz");
+      const waczPath = path.join(this.collDir, waczFilename);
+      const argument_list = ["create", "-o", waczPath, "--pages", this.pagesFile, "-f"];
+      warcFileList.forEach((val) => argument_list.push(path.join(archiveDir, val)));
+
+      // Run the wacz create command
+      child_process.spawnSync("wacz", argument_list);
+      console.log(`WACZ successfully generated and saved to: ${waczFilename}`);
+    }
   }
 
   writeStats() {
     if (this.params.statsFilename) {
       const total = this.cluster.allTargetCount;
       const workersRunning = this.cluster.workersBusy.length;
       const numCrawled = total - this.cluster.jobQueue.size() - workersRunning;
@@ -418,7 +459,7 @@ class Crawler {
       const stats = {numCrawled, workersRunning, total, limit};
 
       try {
-        fs.writeFileSync(this.params.statsFilename, JSON.stringify(stats, null, 2))
+        fs.writeFileSync(this.params.statsFilename, JSON.stringify(stats, null, 2));
       } catch (err) {
         console.warn("Stats output failed", err);
       }
@@ -437,7 +478,6 @@ class Crawler {
       console.warn("Link Extraction failed", e);
       return;
     }
-
     this.queueUrls(results);
   }
@@ -445,7 +485,6 @@ class Crawler {
     try {
       for (const url of urls) {
         const captureUrl = this.shouldCrawl(url);
-
         if (captureUrl) {
           if (!this.queueUrl(captureUrl)) {
             break;
@@ -468,6 +507,32 @@ class Crawler {
     return true;
   }
 
+  initPages() {
+    try {
+      // create pages dir if it doesn't exist and write pages.jsonl header
+      if (!fs.existsSync(this.pagesDir)) {
+        fs.mkdirSync(this.pagesDir);
+        const header = JSON.stringify({"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": false}).concat("\n");
+        fs.writeFileSync(this.pagesFile, header);
+      }
+    } catch(err) {
+      console.log("pages/pages.jsonl creation failed", err);
+    }
+  }
+
+  writePage(url, title) {
+    const id = uuidv4();
+    const row = {"id": id, "url": url, "title": title};
+    const processedRow = JSON.stringify(row).concat("\n");
+    try {
+      fs.appendFileSync(this.pagesFile, processedRow);
+    } catch (err) {
+      console.warn("pages/pages.jsonl append failed", err);
+    }
+  }
+
   shouldCrawl(url) {
     try {
       url = new URL(url);
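
The spawnSync("wacz", ...) call above is equivalent to invoking the wacz CLI directly. For a collection named "example", the generated command would look roughly like the following; the WARC filename is illustrative, since whatever fs.readdirSync finds in the archive dir gets appended:

    wacz create -o collections/example/example.wacz --pages collections/example/pages/pages.jsonl -f collections/example/archive/rec-20210204000000.warc.gz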

package.json

@@ -11,8 +11,9 @@
     "puppeteer-cluster": "^0.22.0",
     "puppeteer-core": "^5.3.1",
     "sitemapper": "^3.1.2",
-    "yargs": "^16.0.3"
+    "yargs": "^16.0.3",
+    "uuid": "8.3.2"
   },
   "devDependencies": {
     "eslint": "^7.12.1"
   }
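
As a usage note, the pages.jsonl written by this commit is straightforward to consume downstream. A minimal sketch of a reader, illustrative only and not part of this commit (assumes Node 12+ for async iteration over readline; the pagesFile path is an example):

    // read-pages.js: list url/title pairs from a pages.jsonl file
    const fs = require("fs");
    const readline = require("readline");

    async function readPages(pagesFile) {
      const rl = readline.createInterface({input: fs.createReadStream(pagesFile)});
      const pages = [];
      for await (const line of rl) {
        if (!line.trim()) {
          continue;
        }
        const entry = JSON.parse(line);
        // skip the one-time header written by initPages()
        if (entry.format === "json-pages-1.0") {
          continue;
        }
        pages.push(entry);
      }
      return pages;
    }

    readPages("collections/example/pages/pages.jsonl").then((pages) => {
      for (const {url, title} of pages) {
        console.log(url, "->", title);
      }
    });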