mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Wait for Pending Requests to Finish (#47)
* pending request wait:
  - instead of always waiting 5s, check the redis key 'pywb:{coll}:pending' to see whether any requests are still pending
  - keep polling the key until the pending count reaches 0
  - requires pywb 2.6.0 or later
  - should fix #44
* fix test to no longer look for the "Waiting 5s" message
* lint settings and fixes: allow constant conditions in loops, add a lint command to the package scripts
* chrome: bump the default image to chrome:90
This commit is contained in:
parent
9d577dac57
commit
183f8edf10
7 changed files with 104 additions and 14 deletions
|
@ -26,6 +26,10 @@ module.exports = {
|
|||
"semi": [
|
||||
"error",
|
||||
"always"
|
||||
],
|
||||
"no-constant-condition": [
|
||||
"error",
|
||||
{"checkLoops": false }
|
||||
]
|
||||
}
|
||||
};
|
|
@ -1,4 +1,4 @@
|
|||
ARG BROWSER_VERSION=88
|
||||
ARG BROWSER_VERSION=90
|
||||
|
||||
FROM oldwebtoday/chrome:${BROWSER_VERSION} as chrome
|
||||
|
||||
|
|
22
crawler.js
22
crawler.js
|
@ -11,6 +11,8 @@ const Sitemapper = require("sitemapper");
|
|||
const { v4: uuidv4 } = require("uuid");
|
||||
const warcio = require("warcio");
|
||||
|
||||
const Redis = require("ioredis");
|
||||
|
||||
const TextExtract = require("./textextract");
|
||||
const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");
|
||||
|
||||
|
@ -567,8 +569,7 @@ class Crawler {
|
|||
}
|
||||
|
||||
// extra wait for all resources to land into WARCs
|
||||
console.log("Waiting 5s to ensure WARCs are finished");
|
||||
await this.sleep(5000);
|
||||
await this.awaitPendingClear();
|
||||
|
||||
if (this.params.combineWARC) {
|
||||
await this.combineWARC();
|
||||
|
@ -807,6 +808,23 @@ class Crawler {
|
|||
abort.abort();
|
||||
}
|
||||
|
||||
async awaitPendingClear() {
|
||||
console.log("Waiting to ensure pending data is written to WARC...");
|
||||
|
||||
const redis = new Redis("redis://localhost/0");
|
||||
|
||||
while (true) {
|
||||
const res = await redis.get(`pywb:${this.params.collection}:pending`);
|
||||
if (res === "0" || !res) {
|
||||
break;
|
||||
}
|
||||
|
||||
console.log(`Still waiting for ${res} pending requests to finish...`);
|
||||
|
||||
await this.sleep(1000);
|
||||
}
|
||||
}
|
||||
|
||||
sleep(time) {
|
||||
return new Promise(resolve => setTimeout(resolve, time));
|
||||
}
|
||||
|
|
|
@ -103,8 +103,7 @@ async function main() {
|
|||
let u, p;
|
||||
|
||||
try {
|
||||
u = await page.waitForXPath("//input[contains(@name, 'user')] or contains(@name, 'email')]");
|
||||
|
||||
u = await page.waitForXPath("//input[contains(@name, 'user') or contains(@name, 'email')]");
|
||||
p = await page.waitForXPath("//input[contains(@name, 'pass') and @type='password']");
|
||||
|
||||
} catch (e) {
|
||||
|
|
|
@ -5,9 +5,13 @@
|
|||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||
"license": "MIT",
|
||||
"scripts": {
|
||||
"lint": "eslint *.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"abort-controller": "^3.0.0",
|
||||
"browsertrix-behaviors": "^0.2.0",
|
||||
"ioredis": "^4.27.1",
|
||||
"node-fetch": "^2.6.1",
|
||||
"puppeteer-cluster": "^0.22.0",
|
||||
"puppeteer-core": "^5.3.1",
|
||||
|
|
|
@ -6,14 +6,9 @@ test("check that the collection name is properly validation", async () => {
|
|||
let passed = "";
|
||||
|
||||
try{
|
||||
const data = await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
|
||||
if (data.stdout.includes("Waiting 5s to ensure WARCs are finished")){
|
||||
await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
|
||||
passed = true;
|
||||
}
|
||||
else{
|
||||
passed = false;
|
||||
}
|
||||
}
|
||||
catch (error) {
|
||||
passed = false;
|
||||
}
|
||||
|
|
70
yarn.lock
70
yarn.lock
|
@ -1226,6 +1226,11 @@ clone-response@^1.0.2:
|
|||
dependencies:
|
||||
mimic-response "^1.0.0"
|
||||
|
||||
cluster-key-slot@^1.1.0:
|
||||
version "1.1.0"
|
||||
resolved "https://registry.yarnpkg.com/cluster-key-slot/-/cluster-key-slot-1.1.0.tgz#30474b2a981fb12172695833052bc0d01336d10d"
|
||||
integrity sha512-2Nii8p3RwAPiFwsnZvukotvow2rIHM+yQ6ZcBXGHdniadkYGZYiGmkHJIbZPIV9nfv7m/U1IPMVVcAhoWFeklw==
|
||||
|
||||
co@^4.6.0:
|
||||
version "4.6.0"
|
||||
resolved "https://registry.npmjs.org/co/-/co-4.6.0.tgz"
|
||||
|
@ -1379,6 +1384,13 @@ debug@^2.2.0, debug@^2.3.3:
|
|||
dependencies:
|
||||
ms "2.0.0"
|
||||
|
||||
debug@^4.3.1:
|
||||
version "4.3.1"
|
||||
resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.1.tgz#f0d229c505e0c6d8c49ac553d1b13dc183f6b2ee"
|
||||
integrity sha512-doEwdvm4PCeK4K3RQN2ZC2BYUBaxwLARCqZmMjtF8a51J2Rb0xpVloFRnCODwqjpwnAoao4pelN8l3RJdv3gRQ==
|
||||
dependencies:
|
||||
ms "2.1.2"
|
||||
|
||||
decamelize@^1.2.0:
|
||||
version "1.2.0"
|
||||
resolved "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz"
|
||||
|
@ -1450,6 +1462,11 @@ delayed-stream@~1.0.0:
|
|||
resolved "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz"
|
||||
integrity sha1-3zrhmayt+31ECqrgsp4icrJOxhk=
|
||||
|
||||
denque@^1.1.0:
|
||||
version "1.5.0"
|
||||
resolved "https://registry.yarnpkg.com/denque/-/denque-1.5.0.tgz#773de0686ff2d8ec2ff92914316a47b73b1c73de"
|
||||
integrity sha512-CYiCSgIF1p6EUByQPlGkKnP1M9g0ZV3qMIrqMqZqdwazygIA/YP2vrbcyl1h/WppKJTdl1F85cXIle+394iDAQ==
|
||||
|
||||
detect-newline@^3.0.0:
|
||||
version "3.1.0"
|
||||
resolved "https://registry.npmjs.org/detect-newline/-/detect-newline-3.1.0.tgz"
|
||||
|
@ -2253,6 +2270,22 @@ internal-slot@^1.0.2:
|
|||
has "^1.0.3"
|
||||
side-channel "^1.0.4"
|
||||
|
||||
ioredis@^4.27.1:
|
||||
version "4.27.1"
|
||||
resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-4.27.1.tgz#4ef947b455a1b995baa4b0d7e2c4e4f75f746421"
|
||||
integrity sha512-PaFNFeBbOcEYHXAdrJuy7uesJcyvzStTM1aYMchTuky+VgKqDbXhnTJHaDsjAwcTwPx8Asatx+l2DW8zZ2xlsQ==
|
||||
dependencies:
|
||||
cluster-key-slot "^1.1.0"
|
||||
debug "^4.3.1"
|
||||
denque "^1.1.0"
|
||||
lodash.defaults "^4.2.0"
|
||||
lodash.flatten "^4.4.0"
|
||||
p-map "^2.1.0"
|
||||
redis-commands "1.7.0"
|
||||
redis-errors "^1.2.0"
|
||||
redis-parser "^3.0.0"
|
||||
standard-as-callback "^2.1.0"
|
||||
|
||||
ip-regex@^2.1.0:
|
||||
version "2.1.0"
|
||||
resolved "https://registry.npmjs.org/ip-regex/-/ip-regex-2.1.0.tgz"
|
||||
|
@ -3078,6 +3111,16 @@ locate-path@^5.0.0:
|
|||
dependencies:
|
||||
p-locate "^4.1.0"
|
||||
|
||||
lodash.defaults@^4.2.0:
|
||||
version "4.2.0"
|
||||
resolved "https://registry.yarnpkg.com/lodash.defaults/-/lodash.defaults-4.2.0.tgz#d09178716ffea4dde9e5fb7b37f6f0802274580c"
|
||||
integrity sha1-0JF4cW/+pN3p5ft7N/bwgCJ0WAw=
|
||||
|
||||
lodash.flatten@^4.4.0:
|
||||
version "4.4.0"
|
||||
resolved "https://registry.yarnpkg.com/lodash.flatten/-/lodash.flatten-4.4.0.tgz#f31c22225a9632d2bbf8e4addbef240aa765a61f"
|
||||
integrity sha1-8xwiIlqWMtK7+OSt2+8kCqdlph8=
|
||||
|
||||
lodash.sortby@^4.7.0:
|
||||
version "4.7.0"
|
||||
resolved "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz"
|
||||
|
@ -3484,6 +3527,11 @@ p-locate@^4.1.0:
|
|||
dependencies:
|
||||
p-limit "^2.2.0"
|
||||
|
||||
p-map@^2.1.0:
|
||||
version "2.1.0"
|
||||
resolved "https://registry.yarnpkg.com/p-map/-/p-map-2.1.0.tgz#310928feef9c9ecc65b68b17693018a665cea175"
|
||||
integrity sha512-y3b8Kpd8OAN444hxfBbFfj1FY/RjtTd8tzYwhUqNYXx0fXx2iX4maP4Qr6qhIKbQXI02wTLAda4fYUbDagTUFw==
|
||||
|
||||
p-try@^2.0.0:
|
||||
version "2.2.0"
|
||||
resolved "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz"
|
||||
|
@ -3729,6 +3777,23 @@ readable-stream@^3.1.1, readable-stream@^3.4.0:
|
|||
string_decoder "^1.1.1"
|
||||
util-deprecate "^1.0.1"
|
||||
|
||||
redis-commands@1.7.0:
|
||||
version "1.7.0"
|
||||
resolved "https://registry.yarnpkg.com/redis-commands/-/redis-commands-1.7.0.tgz#15a6fea2d58281e27b1cd1acfb4b293e278c3a89"
|
||||
integrity sha512-nJWqw3bTFy21hX/CPKHth6sfhZbdiHP6bTawSgQBlKOVRG7EZkfHbbHwQJnrE4vsQf0CMNE+3gJ4Fmm16vdVlQ==
|
||||
|
||||
redis-errors@^1.0.0, redis-errors@^1.2.0:
|
||||
version "1.2.0"
|
||||
resolved "https://registry.yarnpkg.com/redis-errors/-/redis-errors-1.2.0.tgz#eb62d2adb15e4eaf4610c04afe1529384250abad"
|
||||
integrity sha1-62LSrbFeTq9GEMBK/hUpOEJQq60=
|
||||
|
||||
redis-parser@^3.0.0:
|
||||
version "3.0.0"
|
||||
resolved "https://registry.yarnpkg.com/redis-parser/-/redis-parser-3.0.0.tgz#b66d828cdcafe6b4b8a428a7def4c6bcac31c8b4"
|
||||
integrity sha1-tm2CjNyv5rS4pCin3vTGvKwxyLQ=
|
||||
dependencies:
|
||||
redis-errors "^1.0.0"
|
||||
|
||||
regex-not@^1.0.0, regex-not@^1.0.2:
|
||||
version "1.0.2"
|
||||
resolved "https://registry.npmjs.org/regex-not/-/regex-not-1.0.2.tgz"
|
||||
|
@ -4164,6 +4229,11 @@ stack-utils@^2.0.2:
|
|||
dependencies:
|
||||
escape-string-regexp "^2.0.0"
|
||||
|
||||
standard-as-callback@^2.1.0:
|
||||
version "2.1.0"
|
||||
resolved "https://registry.yarnpkg.com/standard-as-callback/-/standard-as-callback-2.1.0.tgz#8953fc05359868a77b5b9739a665c5977bb7df45"
|
||||
integrity sha512-qoRRSyROncaz1z0mvYqIE4lCd9p2R90i6GxW3uZv5ucSu8tU7B5HXUP1gG8pVZsYNVaXjk8ClXHPttLyxAL48A==
|
||||
|
||||
static-extend@^0.1.1:
|
||||
version "0.1.2"
|
||||
resolved "https://registry.npmjs.org/static-extend/-/static-extend-0.1.2.tgz"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue