Wait for Pending Requests to Finish (#47)

* pending request wait:
- instead of always waiting 5s, check the redis key 'pywb:{coll}:pending' to see whether any requests are still pending
- keep checking key until pending requests are at 0
- requires latest pywb 2.6.0+
- should fix #44

* fix test to no longer look for waiting for 5s message

* lint settings and fixes: allow constant conditions in loops (e.g. `while (true)`), add a `lint` command to the package.json scripts

* chrome: bump default image to chrome:90 image
This commit is contained in:
Ilya Kreymer 2021-04-30 12:31:14 -07:00 committed by GitHub
parent 9d577dac57
commit 183f8edf10
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 104 additions and 14 deletions

View file

@ -26,6 +26,10 @@ module.exports = {
"semi": [
"error",
"always"
],
"no-constant-condition": [
"error",
{"checkLoops": false }
]
}
};

View file

@ -1,4 +1,4 @@
ARG BROWSER_VERSION=88
ARG BROWSER_VERSION=90
FROM oldwebtoday/chrome:${BROWSER_VERSION} as chrome

View file

@ -11,6 +11,8 @@ const Sitemapper = require("sitemapper");
const { v4: uuidv4 } = require("uuid");
const warcio = require("warcio");
const Redis = require("ioredis");
const TextExtract = require("./textextract");
const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");
@ -567,8 +569,7 @@ class Crawler {
}
// extra wait for all resources to land into WARCs
console.log("Waiting 5s to ensure WARCs are finished");
await this.sleep(5000);
await this.awaitPendingClear();
if (this.params.combineWARC) {
await this.combineWARC();
@ -807,6 +808,23 @@ class Crawler {
abort.abort();
}
async awaitPendingClear() {
console.log("Waiting to ensure pending data is written to WARC...");
const redis = new Redis("redis://localhost/0");
while (true) {
const res = await redis.get(`pywb:${this.params.collection}:pending`);
if (res === "0" || !res) {
break;
}
console.log(`Still waiting for ${res} pending requests to finish...`);
await this.sleep(1000);
}
}
sleep(time) {
return new Promise(resolve => setTimeout(resolve, time));
}

View file

@ -103,8 +103,7 @@ async function main() {
let u, p;
try {
u = await page.waitForXPath("//input[contains(@name, 'user')] or contains(@name, 'email')]");
u = await page.waitForXPath("//input[contains(@name, 'user') or contains(@name, 'email')]");
p = await page.waitForXPath("//input[contains(@name, 'pass') and @type='password']");
} catch (e) {

View file

@ -5,9 +5,13 @@
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
"license": "MIT",
"scripts": {
"lint": "eslint *.js"
},
"dependencies": {
"abort-controller": "^3.0.0",
"browsertrix-behaviors": "^0.2.0",
"ioredis": "^4.27.1",
"node-fetch": "^2.6.1",
"puppeteer-cluster": "^0.22.0",
"puppeteer-core": "^5.3.1",

View file

@ -6,14 +6,9 @@ test("check that the collection name is properly validation", async () => {
let passed = "";
try{
const data = await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
if (data.stdout.includes("Waiting 5s to ensure WARCs are finished")){
await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
passed = true;
}
else{
passed = false;
}
}
catch (error) {
passed = false;
}

View file

@ -1226,6 +1226,11 @@ clone-response@^1.0.2:
dependencies:
mimic-response "^1.0.0"
cluster-key-slot@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/cluster-key-slot/-/cluster-key-slot-1.1.0.tgz#30474b2a981fb12172695833052bc0d01336d10d"
integrity sha512-2Nii8p3RwAPiFwsnZvukotvow2rIHM+yQ6ZcBXGHdniadkYGZYiGmkHJIbZPIV9nfv7m/U1IPMVVcAhoWFeklw==
co@^4.6.0:
version "4.6.0"
resolved "https://registry.npmjs.org/co/-/co-4.6.0.tgz"
@ -1379,6 +1384,13 @@ debug@^2.2.0, debug@^2.3.3:
dependencies:
ms "2.0.0"
debug@^4.3.1:
version "4.3.1"
resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.1.tgz#f0d229c505e0c6d8c49ac553d1b13dc183f6b2ee"
integrity sha512-doEwdvm4PCeK4K3RQN2ZC2BYUBaxwLARCqZmMjtF8a51J2Rb0xpVloFRnCODwqjpwnAoao4pelN8l3RJdv3gRQ==
dependencies:
ms "2.1.2"
decamelize@^1.2.0:
version "1.2.0"
resolved "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz"
@ -1450,6 +1462,11 @@ delayed-stream@~1.0.0:
resolved "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz"
integrity sha1-3zrhmayt+31ECqrgsp4icrJOxhk=
denque@^1.1.0:
version "1.5.0"
resolved "https://registry.yarnpkg.com/denque/-/denque-1.5.0.tgz#773de0686ff2d8ec2ff92914316a47b73b1c73de"
integrity sha512-CYiCSgIF1p6EUByQPlGkKnP1M9g0ZV3qMIrqMqZqdwazygIA/YP2vrbcyl1h/WppKJTdl1F85cXIle+394iDAQ==
detect-newline@^3.0.0:
version "3.1.0"
resolved "https://registry.npmjs.org/detect-newline/-/detect-newline-3.1.0.tgz"
@ -2253,6 +2270,22 @@ internal-slot@^1.0.2:
has "^1.0.3"
side-channel "^1.0.4"
ioredis@^4.27.1:
version "4.27.1"
resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-4.27.1.tgz#4ef947b455a1b995baa4b0d7e2c4e4f75f746421"
integrity sha512-PaFNFeBbOcEYHXAdrJuy7uesJcyvzStTM1aYMchTuky+VgKqDbXhnTJHaDsjAwcTwPx8Asatx+l2DW8zZ2xlsQ==
dependencies:
cluster-key-slot "^1.1.0"
debug "^4.3.1"
denque "^1.1.0"
lodash.defaults "^4.2.0"
lodash.flatten "^4.4.0"
p-map "^2.1.0"
redis-commands "1.7.0"
redis-errors "^1.2.0"
redis-parser "^3.0.0"
standard-as-callback "^2.1.0"
ip-regex@^2.1.0:
version "2.1.0"
resolved "https://registry.npmjs.org/ip-regex/-/ip-regex-2.1.0.tgz"
@ -3078,6 +3111,16 @@ locate-path@^5.0.0:
dependencies:
p-locate "^4.1.0"
lodash.defaults@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/lodash.defaults/-/lodash.defaults-4.2.0.tgz#d09178716ffea4dde9e5fb7b37f6f0802274580c"
integrity sha1-0JF4cW/+pN3p5ft7N/bwgCJ0WAw=
lodash.flatten@^4.4.0:
version "4.4.0"
resolved "https://registry.yarnpkg.com/lodash.flatten/-/lodash.flatten-4.4.0.tgz#f31c22225a9632d2bbf8e4addbef240aa765a61f"
integrity sha1-8xwiIlqWMtK7+OSt2+8kCqdlph8=
lodash.sortby@^4.7.0:
version "4.7.0"
resolved "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz"
@ -3484,6 +3527,11 @@ p-locate@^4.1.0:
dependencies:
p-limit "^2.2.0"
p-map@^2.1.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/p-map/-/p-map-2.1.0.tgz#310928feef9c9ecc65b68b17693018a665cea175"
integrity sha512-y3b8Kpd8OAN444hxfBbFfj1FY/RjtTd8tzYwhUqNYXx0fXx2iX4maP4Qr6qhIKbQXI02wTLAda4fYUbDagTUFw==
p-try@^2.0.0:
version "2.2.0"
resolved "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz"
@ -3729,6 +3777,23 @@ readable-stream@^3.1.1, readable-stream@^3.4.0:
string_decoder "^1.1.1"
util-deprecate "^1.0.1"
redis-commands@1.7.0:
version "1.7.0"
resolved "https://registry.yarnpkg.com/redis-commands/-/redis-commands-1.7.0.tgz#15a6fea2d58281e27b1cd1acfb4b293e278c3a89"
integrity sha512-nJWqw3bTFy21hX/CPKHth6sfhZbdiHP6bTawSgQBlKOVRG7EZkfHbbHwQJnrE4vsQf0CMNE+3gJ4Fmm16vdVlQ==
redis-errors@^1.0.0, redis-errors@^1.2.0:
version "1.2.0"
resolved "https://registry.yarnpkg.com/redis-errors/-/redis-errors-1.2.0.tgz#eb62d2adb15e4eaf4610c04afe1529384250abad"
integrity sha1-62LSrbFeTq9GEMBK/hUpOEJQq60=
redis-parser@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/redis-parser/-/redis-parser-3.0.0.tgz#b66d828cdcafe6b4b8a428a7def4c6bcac31c8b4"
integrity sha1-tm2CjNyv5rS4pCin3vTGvKwxyLQ=
dependencies:
redis-errors "^1.0.0"
regex-not@^1.0.0, regex-not@^1.0.2:
version "1.0.2"
resolved "https://registry.npmjs.org/regex-not/-/regex-not-1.0.2.tgz"
@ -4164,6 +4229,11 @@ stack-utils@^2.0.2:
dependencies:
escape-string-regexp "^2.0.0"
standard-as-callback@^2.1.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/standard-as-callback/-/standard-as-callback-2.1.0.tgz#8953fc05359868a77b5b9739a665c5977bb7df45"
integrity sha512-qoRRSyROncaz1z0mvYqIE4lCd9p2R90i6GxW3uZv5ucSu8tU7B5HXUP1gG8pVZsYNVaXjk8ClXHPttLyxAL48A==
static-extend@^0.1.1:
version "0.1.2"
resolved "https://registry.npmjs.org/static-extend/-/static-extend-0.1.2.tgz"