mirror of https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Add wacz support to browsertrix (#6)
* Add WACZ creation support, fixes #2
* --generateWACZ flag adds a WACZ file (currently named <collection>/<collection>.wacz)
* page list generated in <collection>/pages/pages.jsonl; an entry for each page is appended to the end of the file and includes the url and title of the page

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent 789279021b
commit 9c139eba2b
3 changed files with 78 additions and 14 deletions
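For orientation, the sketch below is not part of the commit; it just reproduces the two kinds of lines that end up in <collection>/pages/pages.jsonl once this change is in place, based on the initPages() and writePage() code added further down. The id, url, and title values are invented for illustration.

// Sketch only: the header line and a per-page line as written to
// <collection>/pages/pages.jsonl by this commit. Values below are made up.
const header = JSON.stringify({"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": false});
const page = JSON.stringify({"id": "3f1b2c44-0d0e-4e6a-9d15-1c2a3b4c5d6e", "url": "https://example.com/", "title": "Example Domain"});
console.log(header);  // {"format":"json-pages-1.0","id":"pages","title":"All Pages","hasText":false}
console.log(page);    // {"id":"3f1b2c44-...","url":"https://example.com/","title":"Example Domain"}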
Dockerfile

@@ -14,15 +14,13 @@ ENV PROXY_HOST=localhost \
     DISPLAY=:99 \
     GEOMETRY=1360x1020x16
 
-RUN pip install uwsgi
-
-RUN pip install pywb>=2.5.0
-
 COPY --from=chrome /tmp/*.deb /deb/
 COPY --from=chrome /app/libpepflashplayer.so /app/libpepflashplayer.so
 RUN dpkg -i /deb/*.deb; apt-get update; apt-get install -fqqy && \
     rm -rf /var/lib/opts/lists/*
 
+RUN pip install pywb>=2.5.0 uwsgi wacz
+
 WORKDIR /app
 
 ADD package.json /app/
crawler.js (81 changes)

@@ -5,7 +5,8 @@ const fetch = require("node-fetch");
 const AbortController = require("abort-controller");
 const path = require("path");
 const fs = require("fs");
-const Sitemapper = require('sitemapper');
+const Sitemapper = require("sitemapper");
+const { v4: uuidv4 } = require("uuid");
 
 const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
 const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
@@ -50,6 +51,16 @@ class Crawler {
     this.params = params;
     this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;
 
+
+    // root collections dir
+    this.collDir = path.join(this.params.cwd, "collections", this.params.collection);
+
+    // pages directory
+    this.pagesDir = path.join(this.collDir, "pages");
+
+    // pages file
+    this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
+
   }
 
   configureUA() {
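As a rough illustration of the new constructor fields, here is a standalone sketch (not project code) of the paths they resolve to, assuming a hypothetical cwd of /crawls and a collection named wr-net:

// Sketch only: what collDir, pagesDir, and pagesFile evaluate to for a
// hypothetical cwd ("/crawls") and collection name ("wr-net").
const path = require("path");

const cwd = "/crawls";          // hypothetical params.cwd
const collection = "wr-net";    // hypothetical params.collection

const collDir = path.join(cwd, "collections", collection);
const pagesDir = path.join(collDir, "pages");
const pagesFile = path.join(pagesDir, "pages.jsonl");

console.log(collDir);    // /crawls/collections/wr-net
console.log(pagesDir);   // /crawls/collections/wr-net/pages
console.log(pagesFile);  // /crawls/collections/wr-net/pages/pages.jsonl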
@@ -72,7 +83,9 @@ class Crawler {
 
     try {
       version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
-    } catch(e) {}
+    } catch(e) {
+      console.log(e);
+    }
 
     this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
   }
@@ -194,7 +207,13 @@ class Crawler {
       type: "boolean",
       default: false,
     },
 
+    "generateWACZ": {
+      describe: "If set, generate wacz",
+      type: "boolean",
+      default: false,
+    },
+
     "cwd": {
       describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
       type: "string",
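For reference, a minimal sketch (not project code) of how a boolean yargs option like the new generateWACZ flag behaves; the yargs v16 API is assumed here since package.json pins ^16.0.3:

// Sketch only: a boolean yargs option defaults to false and flips to true
// when the flag is present, which is how this.params.generateWACZ gets set.
const yargs = require("yargs/yargs");

const params = yargs(["--generateWACZ"])
  .option("generateWACZ", {describe: "If set, generate wacz", type: "boolean", default: false})
  .parse();

console.log(params.generateWACZ);  // true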
@@ -378,7 +397,8 @@ class Crawler {
     this.cluster.task(async (opts) => {
       try {
         await this.driver({...opts, crawler: this});
-
+        const title = await opts.page.title();
+        this.writePage(opts.data.url, title);
         this.writeStats();
 
       } catch (e) {
@@ -386,6 +406,8 @@ class Crawler {
       }
     });
 
+    this.initPages();
+
     this.queueUrl(this.params.url);
 
     if (this.params.useSitemap) {
@@ -406,11 +428,30 @@ class Crawler {
 
       child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
     }
 
+    if (this.params.generateWACZ) {
+      console.log("Generating WACZ");
+
+      const archiveDir = path.join(this.collDir, "archive");
+
+      // Get a list of the warcs inside
+      const warcFileList = fs.readdirSync(archiveDir);
+
+      // Build the argument list to pass to the wacz create command
+      const waczFilename = this.params.collection.concat(".wacz");
+      const waczPath = path.join(this.collDir, waczFilename);
+      const argument_list = ["create", "-o", waczPath, "--pages", this.pagesFile, "-f"];
+      warcFileList.forEach((val, index) => argument_list.push(path.join(archiveDir, val)));
+
+      // Run the wacz create command
+      child_process.spawnSync("wacz" , argument_list);
+      console.log(`WACZ successfully generated and saved to: ${waczFilename}`);
+    }
   }
 
 
   writeStats() {
     if (this.params.statsFilename) {
 
       const total = this.cluster.allTargetCount;
       const workersRunning = this.cluster.workersBusy.length;
       const numCrawled = total - this.cluster.jobQueue.size() - workersRunning;
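To make the spawned command concrete, here is a sketch (not part of the commit; file names are hypothetical) of the argument list the generateWACZ block above would hand to the wacz CLI for a collection containing a single WARC:

// Sketch only: the argv that child_process.spawnSync("wacz", argument_list)
// receives, for a hypothetical collection "wr-net" with one WARC file.
const path = require("path");

const collDir = "/crawls/collections/wr-net";                 // hypothetical
const pagesFile = path.join(collDir, "pages", "pages.jsonl");
const waczPath = path.join(collDir, "wr-net.wacz");
const warcFileList = ["rec-20201103-example.warc.gz"];        // hypothetical readdirSync() result

const argument_list = ["create", "-o", waczPath, "--pages", pagesFile, "-f"];
warcFileList.forEach((val) => argument_list.push(path.join(collDir, "archive", val)));

console.log(argument_list.join(" "));
// create -o /crawls/collections/wr-net/wr-net.wacz --pages /crawls/collections/wr-net/pages/pages.jsonl -f /crawls/collections/wr-net/archive/rec-20201103-example.warc.gz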
@@ -418,7 +459,7 @@ class Crawler {
       const stats = {numCrawled, workersRunning, total, limit};
 
       try {
-        fs.writeFileSync(this.params.statsFilename, JSON.stringify(stats, null, 2))
+        fs.writeFileSync(this.params.statsFilename, JSON.stringify(stats, null, 2));
       } catch (err) {
         console.warn("Stats output failed", err);
       }
@@ -437,7 +478,6 @@ class Crawler {
       console.warn("Link Extraction failed", e);
       return;
     }
 
     this.queueUrls(results);
   }
@@ -445,7 +485,6 @@ class Crawler {
     try {
       for (const url of urls) {
         const captureUrl = this.shouldCrawl(url);
 
         if (captureUrl) {
           if (!this.queueUrl(captureUrl)) {
             break;
@@ -468,6 +507,32 @@ class Crawler {
     return true;
   }
 
+  initPages() {
+    try {
+      // create pages dir if doesn't exist and write pages.jsonl header
+      if (!fs.existsSync(this.pagesDir)) {
+        fs.mkdirSync(this.pagesDir);
+        const header = JSON.stringify({"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": false}).concat("\n");
+        fs.writeFileSync(this.pagesFile, header);
+      }
+    } catch(err) {
+      console.log("pages/pages.jsonl creation failed", err);
+    }
+  }
+
+  writePage(url, title){
+    const id = uuidv4();
+    const today = new Date();
+    const row = {"id": id, "url": url, "title": title};
+    const processedRow = JSON.stringify(row).concat("\n");
+    try {
+      fs.appendFileSync(this.pagesFile, processedRow);
+    }
+    catch (err) {
+      console.warn("pages/pages.jsonl append failed", err);
+    }
+  }
+
   shouldCrawl(url) {
     try {
       url = new URL(url);
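Finally, a small sketch (not part of the commit; readPages is a hypothetical helper) of how a consumer could read back the pages.jsonl file that initPages() and writePage() produce:

// Sketch only: parse pages.jsonl into its header record and page records,
// assuming the layout written by initPages() and writePage() above.
const fs = require("fs");

function readPages(pagesFile) {
  const lines = fs.readFileSync(pagesFile, "utf8").split("\n").filter((line) => line.length);
  const [header, ...pages] = lines.map((line) => JSON.parse(line));
  // header: {format: "json-pages-1.0", ...}; pages: [{id, url, title}, ...]
  return {header, pages};
}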
package.json

@@ -11,8 +11,9 @@
     "puppeteer-cluster": "^0.22.0",
     "puppeteer-core": "^5.3.1",
     "sitemapper": "^3.1.2",
-    "yargs": "^16.0.3"
+    "yargs": "^16.0.3",
+    "uuid": "8.3.2"
   },
   "devDependencies": {
     "eslint": "^7.12.1"
   }