From e7a850c380bbfaa6ff049287a48534310310f209 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 7 Nov 2023 17:20:08 -0800 Subject: [PATCH 1/5] Apply suggestions from code review, remove commented out code Co-authored-by: Tessa Walsh --- crawler.js | 9 ++------- util/browser.js | 3 +-- util/recorder.js | 5 +---- util/reqresp.js | 1 - util/warcwriter.js | 5 +---- 5 files changed, 5 insertions(+), 18 deletions(-) diff --git a/crawler.js b/crawler.js index 41f60d21..feae2616 100644 --- a/crawler.js +++ b/crawler.js @@ -681,9 +681,8 @@ self.__bx_behaviors.selectMainBehavior(); async getInfoString() { const packageFileJSON = JSON.parse(await fsp.readFile("../app/package.json")); const warcioPackageJSON = JSON.parse(await fsp.readFile("/app/node_modules/warcio/package.json")); - const pywbVersion = "0.0";//child_process.execSync("pywb -V", {encoding: "utf8"}).trim().split(" ")[1]; - return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version} pywb ${pywbVersion})`; + return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`; } async createWARCInfo(filename) { @@ -893,7 +892,7 @@ self.__bx_behaviors.selectMainBehavior(); headless: this.params.headless, emulateDevice: this.emulateDevice, chromeOptions: { - proxy: false,//!process.env.NO_PROXY, + proxy: false, userAgent: this.emulateDevice.userAgent, extraArgs: this.extraChromeArgs() }, @@ -903,7 +902,6 @@ self.__bx_behaviors.selectMainBehavior(); } }); - //const archiveDir = path.join(this.collDir, "archive"); // -------------- // Run Crawl Here! @@ -921,9 +919,6 @@ self.__bx_behaviors.selectMainBehavior(); await this.writeStats(); - // extra wait for all resources to land into WARCs - // now happens at end of each page - // await this.awaitPendingClear(); // if crawl has been stopped, mark as final exit for post-crawl tasks if (await this.crawlState.isCrawlStopped()) { diff --git a/util/browser.js b/util/browser.js index bcfdc0ee..5dee4b20 100644 --- a/util/browser.js +++ b/util/browser.js @@ -316,7 +316,6 @@ export class Browser extends BaseBrowser for (const recorder of this.recorders) { if (recorder.swUrls.has(request.url)) { - //console.log(`*** found sw ${request.url} in recorder for worker ${recorder.workerid}`); recorder.swFrameIds.add(frameId); } @@ -327,7 +326,7 @@ export class Browser extends BaseBrowser } if (!foundRecorder) { - logger.warn("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder"); + logger.debug("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder"); try { await this.firstCDP.send("Fetch.continueResponse", {requestId}); diff --git a/util/recorder.js b/util/recorder.js index cc9a7d8d..d508bb98 100644 --- a/util/recorder.js +++ b/util/recorder.js @@ -416,8 +416,6 @@ export class Recorder } async finishPage() { - //this.skipping = true; - for (const [requestId, reqresp] of this.pendingRequests.entries()) { if (reqresp.payload) { this.removeReqResp(requestId); @@ -451,7 +449,7 @@ export class Recorder } async onClosePage() { - +// Any page-specific handling before page is closed. } async onDone() { @@ -664,7 +662,6 @@ export class Recorder class AsyncFetcher { constructor({tempdir, reqresp, expectedSize = -1, recorder, networkId, filter = null, ignoreDupe = false}) { - //super(); this.reqresp = reqresp; this.reqresp.expectedSize = expectedSize; this.reqresp.asyncLoading = true; diff --git a/util/reqresp.js b/util/reqresp.js index 41fb67a0..d5c86020 100644 --- a/util/reqresp.js +++ b/util/reqresp.js @@ -67,7 +67,6 @@ export class RequestResponseInfo this.resourceType = params.type; } - //this.loaderId = params.loaderId; } fillFetchRequestPaused(params) { diff --git a/util/warcwriter.js b/util/warcwriter.js index 2ca1923d..3f092833 100644 --- a/util/warcwriter.js +++ b/util/warcwriter.js @@ -71,9 +71,6 @@ export class WARCWriter } catch (e) { logger.error("Error writing to WARC, corruption possible", {...errJSON(e), url, ...this.logDetails}, "writer"); } - if (!(count % 10)) { - //logNetwork("Writing WARC Chunk", {total, count, url, logDetails}); - } } return total; @@ -113,4 +110,4 @@ export function streamFinish(fh) { }); fh.end(); return p; -} \ No newline at end of file +} From 988bf7a08a7dec2f5c7e2f4ece9d0b3b072c92d1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 7 Nov 2023 17:24:04 -0800 Subject: [PATCH 2/5] remove unused code, remove references to pywb --- create-login-profile.js | 15 +-------------- util/argParser.js | 2 +- util/recorder.js | 2 +- util/warcwriter.js | 2 -- 4 files changed, 3 insertions(+), 18 deletions(-) diff --git a/create-login-profile.js b/create-login-profile.js index 291b40e2..182287df 100755 --- a/create-login-profile.js +++ b/create-login-profile.js @@ -11,7 +11,6 @@ import yargs from "yargs"; import { logger } from "./util/logger.js"; -import { sleep } from "./util/timing.js"; import { Browser } from "./util/browser.js"; import { initStorage } from "./util/storage.js"; @@ -144,18 +143,6 @@ async function main() { ]); } - let useProxy = false; - - if (params.proxy) { - child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"}); - - logger.debug("Running with pywb proxy"); - - await sleep(3000); - - useProxy = true; - } - const browser = new Browser(); await browser.launch({ @@ -163,7 +150,7 @@ async function main() { headless: params.headless, signals: true, chromeOptions: { - proxy: useProxy, + proxy: false, extraArgs: [ "--window-position=0,0", `--window-size=${params.windowSize}`, diff --git a/util/argParser.js b/util/argParser.js index 2eb15839..5ca6cbd0 100644 --- a/util/argParser.js +++ b/util/argParser.js @@ -178,7 +178,7 @@ class ArgParser { }, "logging": { - describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug", + describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, debug", type: "array", default: ["stats"], coerce, diff --git a/util/recorder.js b/util/recorder.js index d508bb98..e5e6e06f 100644 --- a/util/recorder.js +++ b/util/recorder.js @@ -449,7 +449,7 @@ export class Recorder } async onClosePage() { -// Any page-specific handling before page is closed. + // Any page-specific handling before page is closed. } async onDone() { diff --git a/util/warcwriter.js b/util/warcwriter.js index 3f092833..0ff26666 100644 --- a/util/warcwriter.js +++ b/util/warcwriter.js @@ -60,12 +60,10 @@ export class WARCWriter async _writeRecord(record, serializer) { let total = 0; - let count = 0; const url = record.warcTargetURI; for await (const chunk of serializer) { total += chunk.length; - count++; try { this.fh.write(chunk); } catch (e) { From 034de9a78d1becd604087da16c596bf9d7e05a10 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 7 Nov 2023 17:38:15 -0800 Subject: [PATCH 3/5] fix warcinfo test after version update --- tests/warcinfo.test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/warcinfo.test.js b/tests/warcinfo.test.js index f6d797b3..2c353f9b 100644 --- a/tests/warcinfo.test.js +++ b/tests/warcinfo.test.js @@ -21,7 +21,7 @@ test("check that the warcinfo file works as expected on the command line", async expect(string.indexOf("operator: test")).toBeGreaterThan(-1); expect(string.indexOf("host: hostname")).toBeGreaterThan(-1); - expect(string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+ pywb \d[\w.-]+\)/)).not.toEqual(null); + expect(string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/)).not.toEqual(null); expect(string.indexOf("format: WARC File Format 1.0")).toBeGreaterThan(-1); From 468a00939daadcfeb8a6f8b317c6402f15c5d087 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 7 Nov 2023 18:24:13 -0800 Subject: [PATCH 4/5] logging: reenable logging for timed out pending requests for now --- util/recorder.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/util/recorder.js b/util/recorder.js index e5e6e06f..eac522ef 100644 --- a/util/recorder.js +++ b/util/recorder.js @@ -407,9 +407,9 @@ export class Recorder startPage({pageid, url}) { this.pageid = pageid; this.logDetails = {page: url, workerid: this.workerid}; - // if (this.pendingRequests && this.pendingRequests.size) { - // logger.warn("Interrupting timed out requests, moving to next page", this.logDetails, "recorder"); - // } + if (this.pendingRequests && this.pendingRequests.size) { + logger.debug("Interrupting timed out requests, moving to next page", this.logDetails, "recorder"); + } this.pendingRequests = new Map(); this.skipIds = new Set(); this.skipping = false; From 868cd7ab48febe0f1f91431700bf896b854d59f9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 7 Nov 2023 20:01:42 -0800 Subject: [PATCH 5/5] remove pywb dependency - only keep py-wacz - use cdxj-indexer for --generateCDX --- Dockerfile | 1 - crawler.js | 12 +++++++++++- requirements.txt | 3 --- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index a980ab90..c1ff54d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,7 +20,6 @@ ENV PROXY_HOST=localhost \ WORKDIR /app ADD requirements.txt /app/ -RUN pip install 'uwsgi==2.0.21' RUN pip install -U setuptools; pip install -r requirements.txt ADD package.json /app/ diff --git a/crawler.js b/crawler.js index feae2616..c98d9275 100644 --- a/crawler.js +++ b/crawler.js @@ -937,7 +937,17 @@ self.__bx_behaviors.selectMainBehavior(); logger.info("Generating CDX"); await fsp.mkdir(path.join(this.collDir, "indexes"), {recursive: true}); await this.crawlState.setStatus("generate-cdx"); - const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd})); + + const warcList = await fsp.readdir(path.join(this.collDir, "archive")); + const warcListFull = warcList.map((filename) => path.join(this.collDir, "archive", filename)); + + //const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd})); + const params = [ + "-o", + path.join(this.collDir, "indexes", "index.cdxj"), + ...warcListFull + ]; + const indexResult = await this.awaitProcess(child_process.spawn("cdxj-indexer", params, {cwd: this.params.cwd})); if (indexResult === 0) { logger.debug("Indexing complete, CDX successfully created"); } else { diff --git a/requirements.txt b/requirements.txt index 59dd3892..aa919c23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1 @@ -pywb>=2.7.4 -uwsgi wacz>=0.4.9 -requests[socks]