Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
Add wacz support to browsertrix (#6)
* Add WACZ creation support, fixes #2
* --generateWACZ flag adds WACZ file (currently named <collection>/<collection>.wacz)
* page list generated in <collection>/pages/pages.jsonl, entry for each page is appended to end of file, includes url and title of page

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
parent 789279021b
commit 9c139eba2b

3 changed files with 78 additions and 14 deletions
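For orientation, the on-disk layout this commit produces under the crawl working directory is sketched below; <cwd> and <collection> are placeholders, and the WARC filenames are whatever pywb records into the archive directory:

    <cwd>/collections/<collection>/
        archive/               WARC files recorded via pywb
        pages/pages.jsonl      one JSON line per crawled page (url and title)
        <collection>.wacz      written only when --generateWACZ is set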
Dockerfile
@@ -14,15 +14,13 @@ ENV PROXY_HOST=localhost \
     DISPLAY=:99 \
     GEOMETRY=1360x1020x16

-RUN pip install uwsgi
-
-RUN pip install pywb>=2.5.0
-
 COPY --from=chrome /tmp/*.deb /deb/
 COPY --from=chrome /app/libpepflashplayer.so /app/libpepflashplayer.so
 RUN dpkg -i /deb/*.deb; apt-get update; apt-get install -fqqy && \
     rm -rf /var/lib/opts/lists/*

+RUN pip install pywb>=2.5.0 uwsgi wacz
+
 WORKDIR /app

 ADD package.json /app/
crawler.js
@@ -5,7 +5,8 @@ const fetch = require("node-fetch");
 const AbortController = require("abort-controller");
 const path = require("path");
 const fs = require("fs");
-const Sitemapper = require('sitemapper');
+const Sitemapper = require("sitemapper");
+const { v4: uuidv4 } = require("uuid");

 const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
 const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
@@ -50,6 +51,16 @@ class Crawler {

     this.params = params;
     this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;
+
+
+    // root collections dir
+    this.collDir = path.join(this.params.cwd, "collections", this.params.collection);
+
+    // pages directory
+    this.pagesDir = path.join(this.collDir, "pages");
+
+    // pages file
+    this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
   }

   configureUA() {
@@ -72,7 +83,9 @@ class Crawler {

     try {
       version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
-    } catch(e) {}
+    } catch(e) {
+      console.log(e);
+    }

     this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
   }
@@ -195,6 +208,12 @@ class Crawler {
         default: false,
       },

+      "generateWACZ": {
+        describe: "If set, generate wacz",
+        type: "boolean",
+        default: false,
+      },
+
       "cwd": {
         describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
         type: "string",
@@ -378,7 +397,8 @@ class Crawler {
     this.cluster.task(async (opts) => {
       try {
         await this.driver({...opts, crawler: this});
-
+        const title = await opts.page.title();
+        this.writePage(opts.data.url, title);
         this.writeStats();

       } catch (e) {
@@ -386,6 +406,8 @@ class Crawler {
       }
     });

+    this.initPages();
+
     this.queueUrl(this.params.url);

     if (this.params.useSitemap) {
@@ -406,11 +428,30 @@ class Crawler {
       child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
     }
+
+    if (this.params.generateWACZ) {
+      console.log("Generating WACZ");
+
+      const archiveDir = path.join(this.collDir, "archive");
+
+      // Get a list of the warcs inside
+      const warcFileList = fs.readdirSync(archiveDir);
+
+      // Build the argument list to pass to the wacz create command
+      const waczFilename = this.params.collection.concat(".wacz");
+      const waczPath = path.join(this.collDir, waczFilename);
+      const argument_list = ["create", "-o", waczPath, "--pages", this.pagesFile, "-f"];
+      warcFileList.forEach((val, index) => argument_list.push(path.join(archiveDir, val)));
+
+      // Run the wacz create command
+      child_process.spawnSync("wacz" , argument_list);
+      console.log(`WACZ successfully generated and saved to: ${waczFilename}`);
+    }
   }


   writeStats() {
     if (this.params.statsFilename) {

       const total = this.cluster.allTargetCount;
       const workersRunning = this.cluster.workersBusy.length;
       const numCrawled = total - this.cluster.jobQueue.size() - workersRunning;
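For illustration only, assuming a collection named "example" and a single recorded WARC file rec-00000.warc.gz (both hypothetical), the argument_list built above makes the spawnSync call equivalent to running:

    wacz create -o <cwd>/collections/example/example.wacz --pages <cwd>/collections/example/pages/pages.jsonl -f <cwd>/collections/example/archive/rec-00000.warc.gz

Every WARC found in the archive directory is appended after -f, so a multi-WARC crawl is packaged in a single invocation.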
@@ -418,7 +459,7 @@ class Crawler {
       const stats = {numCrawled, workersRunning, total, limit};

       try {
-        fs.writeFileSync(this.params.statsFilename, JSON.stringify(stats, null, 2))
+        fs.writeFileSync(this.params.statsFilename, JSON.stringify(stats, null, 2));
       } catch (err) {
         console.warn("Stats output failed", err);
       }
@@ -437,7 +478,6 @@ class Crawler {
       console.warn("Link Extraction failed", e);
       return;
     }
-
     this.queueUrls(results);
   }

@@ -445,7 +485,6 @@ class Crawler {
     try {
       for (const url of urls) {
         const captureUrl = this.shouldCrawl(url);
-
         if (captureUrl) {
           if (!this.queueUrl(captureUrl)) {
             break;
@@ -468,6 +507,32 @@ class Crawler {
     return true;
   }

+  initPages() {
+    try {
+      // create pages dir if doesn't exist and write pages.jsonl header
+      if (!fs.existsSync(this.pagesDir)) {
+        fs.mkdirSync(this.pagesDir);
+        const header = JSON.stringify({"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": false}).concat("\n");
+        fs.writeFileSync(this.pagesFile, header);
+      }
+    } catch(err) {
+      console.log("pages/pages.jsonl creation failed", err);
+    }
+  }
+
+  writePage(url, title){
+    const id = uuidv4();
+    const today = new Date();
+    const row = {"id": id, "url": url, "title": title};
+    const processedRow = JSON.stringify(row).concat("\n");
+    try {
+      fs.appendFileSync(this.pagesFile, processedRow);
+    }
+    catch (err) {
+      console.warn("pages/pages.jsonl append failed", err);
+    }
+  }
+
   shouldCrawl(url) {
     try {
       url = new URL(url);
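As a concrete example of the pages.jsonl format introduced above (the uuid, url, and title values are hypothetical): initPages() writes the header line once, and writePage() appends one line per crawled page, so after a single-page crawl the file would contain:

    {"format": "json-pages-1.0", "id": "pages", "title": "All Pages", "hasText": false}
    {"id": "f1b2c3d4-5678-4abc-9def-0123456789ab", "url": "https://example.com/", "title": "Example Domain"}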
package.json
@@ -11,8 +11,9 @@
     "puppeteer-cluster": "^0.22.0",
     "puppeteer-core": "^5.3.1",
     "sitemapper": "^3.1.2",
-    "yargs": "^16.0.3"
-  },
+    "yargs": "^16.0.3",
+    "uuid": "8.3.2"
+  },
   "devDependencies": {
     "eslint": "^7.12.1"
   }