From 183f8edf10d377f85affdae83a76bed3e0f39cb5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 30 Apr 2021 12:31:14 -0700 Subject: [PATCH] Wait for Pending Requests to Finish (#47) * pending request wait: - instead of waiting for 5s, check redis key 'pywb:{coll}:pending' to see if any pending requests are still pending - keep checking key until pending requests are at 0 - requires latest pywb 2.6.0+ - should fix #44 * fix test to no longer look for waiting for 5s message * lint settings and fixes: allow constant in loops, add lint command to script * chrome: bump default image to chrome:90 image --- .eslintrc.js | 6 ++- Dockerfile | 2 +- crawler.js | 24 ++++++++++-- create-login-profile.js | 3 +- package.json | 4 ++ tests/collection_name.test.js | 9 +---- yarn.lock | 70 +++++++++++++++++++++++++++++++++++ 7 files changed, 104 insertions(+), 14 deletions(-) diff --git a/.eslintrc.js b/.eslintrc.js index a6adb144..dc587cb2 100644 --- a/.eslintrc.js +++ b/.eslintrc.js @@ -26,6 +26,10 @@ module.exports = { "semi": [ "error", "always" + ], + "no-constant-condition": [ + "error", + {"checkLoops": false } ] } -}; \ No newline at end of file +}; diff --git a/Dockerfile b/Dockerfile index e2248b67..55d94ec5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG BROWSER_VERSION=88 +ARG BROWSER_VERSION=90 FROM oldwebtoday/chrome:${BROWSER_VERSION} as chrome diff --git a/crawler.js b/crawler.js index b8aa3c6c..fde2a10f 100644 --- a/crawler.js +++ b/crawler.js @@ -11,6 +11,8 @@ const Sitemapper = require("sitemapper"); const { v4: uuidv4 } = require("uuid"); const warcio = require("warcio"); +const Redis = require("ioredis"); + const TextExtract = require("./textextract"); const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8"); @@ -567,9 +569,8 @@ class Crawler { } // extra wait for all resources to land into WARCs - console.log("Waiting 5s to ensure WARCs are finished"); - await this.sleep(5000); - + await this.awaitPendingClear(); + if (this.params.combineWARC) { await this.combineWARC(); } @@ -807,6 +808,23 @@ class Crawler { abort.abort(); } + async awaitPendingClear() { + console.log("Waiting to ensure pending data is written to WARC..."); + + const redis = new Redis("redis://localhost/0"); + + while (true) { + const res = await redis.get(`pywb:${this.params.collection}:pending`); + if (res === "0" || !res) { + break; + } + + console.log(`Still waiting for ${res} pending requests to finish...`); + + await this.sleep(1000); + } + } + sleep(time) { return new Promise(resolve => setTimeout(resolve, time)); } diff --git a/create-login-profile.js b/create-login-profile.js index 5847f2eb..7a73d17b 100755 --- a/create-login-profile.js +++ b/create-login-profile.js @@ -103,8 +103,7 @@ async function main() { let u, p; try { - u = await page.waitForXPath("//input[contains(@name, 'user')] or contains(@name, 'email')]"); - + u = await page.waitForXPath("//input[contains(@name, 'user') or contains(@name, 'email')]"); p = await page.waitForXPath("//input[contains(@name, 'pass') and @type='password']"); } catch (e) { diff --git a/package.json b/package.json index 8884f4f2..4333e1df 100644 --- a/package.json +++ b/package.json @@ -5,9 +5,13 @@ "repository": "https://github.com/webrecorder/browsertrix-crawler", "author": "Ilya Kreymer , Webrecorder Software", "license": "MIT", + "scripts": { + "lint": "eslint *.js" + }, "dependencies": { "abort-controller": "^3.0.0", "browsertrix-behaviors": "^0.2.0", + "ioredis": "^4.27.1", "node-fetch": "^2.6.1", "puppeteer-cluster": "^0.22.0", "puppeteer-core": "^5.3.1", diff --git a/tests/collection_name.test.js b/tests/collection_name.test.js index 1711e44e..9ae94464 100644 --- a/tests/collection_name.test.js +++ b/tests/collection_name.test.js @@ -6,13 +6,8 @@ test("check that the collection name is properly validation", async () => { let passed = ""; try{ - const data = await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid"); - if (data.stdout.includes("Waiting 5s to ensure WARCs are finished")){ - passed = true; - } - else{ - passed = false; - } + await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid"); + passed = true; } catch (error) { passed = false; diff --git a/yarn.lock b/yarn.lock index 7b3cd45e..471d3d7c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1226,6 +1226,11 @@ clone-response@^1.0.2: dependencies: mimic-response "^1.0.0" +cluster-key-slot@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/cluster-key-slot/-/cluster-key-slot-1.1.0.tgz#30474b2a981fb12172695833052bc0d01336d10d" + integrity sha512-2Nii8p3RwAPiFwsnZvukotvow2rIHM+yQ6ZcBXGHdniadkYGZYiGmkHJIbZPIV9nfv7m/U1IPMVVcAhoWFeklw== + co@^4.6.0: version "4.6.0" resolved "https://registry.npmjs.org/co/-/co-4.6.0.tgz" @@ -1379,6 +1384,13 @@ debug@^2.2.0, debug@^2.3.3: dependencies: ms "2.0.0" +debug@^4.3.1: + version "4.3.1" + resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.1.tgz#f0d229c505e0c6d8c49ac553d1b13dc183f6b2ee" + integrity sha512-doEwdvm4PCeK4K3RQN2ZC2BYUBaxwLARCqZmMjtF8a51J2Rb0xpVloFRnCODwqjpwnAoao4pelN8l3RJdv3gRQ== + dependencies: + ms "2.1.2" + decamelize@^1.2.0: version "1.2.0" resolved "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz" @@ -1450,6 +1462,11 @@ delayed-stream@~1.0.0: resolved "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz" integrity sha1-3zrhmayt+31ECqrgsp4icrJOxhk= +denque@^1.1.0: + version "1.5.0" + resolved "https://registry.yarnpkg.com/denque/-/denque-1.5.0.tgz#773de0686ff2d8ec2ff92914316a47b73b1c73de" + integrity sha512-CYiCSgIF1p6EUByQPlGkKnP1M9g0ZV3qMIrqMqZqdwazygIA/YP2vrbcyl1h/WppKJTdl1F85cXIle+394iDAQ== + detect-newline@^3.0.0: version "3.1.0" resolved "https://registry.npmjs.org/detect-newline/-/detect-newline-3.1.0.tgz" @@ -2253,6 +2270,22 @@ internal-slot@^1.0.2: has "^1.0.3" side-channel "^1.0.4" +ioredis@^4.27.1: + version "4.27.1" + resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-4.27.1.tgz#4ef947b455a1b995baa4b0d7e2c4e4f75f746421" + integrity sha512-PaFNFeBbOcEYHXAdrJuy7uesJcyvzStTM1aYMchTuky+VgKqDbXhnTJHaDsjAwcTwPx8Asatx+l2DW8zZ2xlsQ== + dependencies: + cluster-key-slot "^1.1.0" + debug "^4.3.1" + denque "^1.1.0" + lodash.defaults "^4.2.0" + lodash.flatten "^4.4.0" + p-map "^2.1.0" + redis-commands "1.7.0" + redis-errors "^1.2.0" + redis-parser "^3.0.0" + standard-as-callback "^2.1.0" + ip-regex@^2.1.0: version "2.1.0" resolved "https://registry.npmjs.org/ip-regex/-/ip-regex-2.1.0.tgz" @@ -3078,6 +3111,16 @@ locate-path@^5.0.0: dependencies: p-locate "^4.1.0" +lodash.defaults@^4.2.0: + version "4.2.0" + resolved "https://registry.yarnpkg.com/lodash.defaults/-/lodash.defaults-4.2.0.tgz#d09178716ffea4dde9e5fb7b37f6f0802274580c" + integrity sha1-0JF4cW/+pN3p5ft7N/bwgCJ0WAw= + +lodash.flatten@^4.4.0: + version "4.4.0" + resolved "https://registry.yarnpkg.com/lodash.flatten/-/lodash.flatten-4.4.0.tgz#f31c22225a9632d2bbf8e4addbef240aa765a61f" + integrity sha1-8xwiIlqWMtK7+OSt2+8kCqdlph8= + lodash.sortby@^4.7.0: version "4.7.0" resolved "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz" @@ -3484,6 +3527,11 @@ p-locate@^4.1.0: dependencies: p-limit "^2.2.0" +p-map@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/p-map/-/p-map-2.1.0.tgz#310928feef9c9ecc65b68b17693018a665cea175" + integrity sha512-y3b8Kpd8OAN444hxfBbFfj1FY/RjtTd8tzYwhUqNYXx0fXx2iX4maP4Qr6qhIKbQXI02wTLAda4fYUbDagTUFw== + p-try@^2.0.0: version "2.2.0" resolved "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz" @@ -3729,6 +3777,23 @@ readable-stream@^3.1.1, readable-stream@^3.4.0: string_decoder "^1.1.1" util-deprecate "^1.0.1" +redis-commands@1.7.0: + version "1.7.0" + resolved "https://registry.yarnpkg.com/redis-commands/-/redis-commands-1.7.0.tgz#15a6fea2d58281e27b1cd1acfb4b293e278c3a89" + integrity sha512-nJWqw3bTFy21hX/CPKHth6sfhZbdiHP6bTawSgQBlKOVRG7EZkfHbbHwQJnrE4vsQf0CMNE+3gJ4Fmm16vdVlQ== + +redis-errors@^1.0.0, redis-errors@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/redis-errors/-/redis-errors-1.2.0.tgz#eb62d2adb15e4eaf4610c04afe1529384250abad" + integrity sha1-62LSrbFeTq9GEMBK/hUpOEJQq60= + +redis-parser@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/redis-parser/-/redis-parser-3.0.0.tgz#b66d828cdcafe6b4b8a428a7def4c6bcac31c8b4" + integrity sha1-tm2CjNyv5rS4pCin3vTGvKwxyLQ= + dependencies: + redis-errors "^1.0.0" + regex-not@^1.0.0, regex-not@^1.0.2: version "1.0.2" resolved "https://registry.npmjs.org/regex-not/-/regex-not-1.0.2.tgz" @@ -4164,6 +4229,11 @@ stack-utils@^2.0.2: dependencies: escape-string-regexp "^2.0.0" +standard-as-callback@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/standard-as-callback/-/standard-as-callback-2.1.0.tgz#8953fc05359868a77b5b9739a665c5977bb7df45" + integrity sha512-qoRRSyROncaz1z0mvYqIE4lCd9p2R90i6GxW3uZv5ucSu8tU7B5HXUP1gG8pVZsYNVaXjk8ClXHPttLyxAL48A== + static-extend@^0.1.1: version "0.1.2" resolved "https://registry.npmjs.org/static-extend/-/static-extend-0.1.2.tgz"