mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Wait for Pending Requests to Finish (#47)
* pending request wait: - instead of waiting for 5s, check redis key 'pywb:{coll}:pending' to see if any pending requests are still pending - keep checking key until pending requests are at 0 - requires latest pywb 2.6.0+ - should fix #44 * fix test to no longer look for waiting for 5s message * lint settings and fixes: allow constant in loops, add lint command to script * chrome: bump default image to chrome:90 image
This commit is contained in:
parent
9d577dac57
commit
183f8edf10
7 changed files with 104 additions and 14 deletions
|
@ -26,6 +26,10 @@ module.exports = {
|
||||||
"semi": [
|
"semi": [
|
||||||
"error",
|
"error",
|
||||||
"always"
|
"always"
|
||||||
|
],
|
||||||
|
"no-constant-condition": [
|
||||||
|
"error",
|
||||||
|
{"checkLoops": false }
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
ARG BROWSER_VERSION=88
|
ARG BROWSER_VERSION=90
|
||||||
|
|
||||||
FROM oldwebtoday/chrome:${BROWSER_VERSION} as chrome
|
FROM oldwebtoday/chrome:${BROWSER_VERSION} as chrome
|
||||||
|
|
||||||
|
|
24
crawler.js
24
crawler.js
|
@ -11,6 +11,8 @@ const Sitemapper = require("sitemapper");
|
||||||
const { v4: uuidv4 } = require("uuid");
|
const { v4: uuidv4 } = require("uuid");
|
||||||
const warcio = require("warcio");
|
const warcio = require("warcio");
|
||||||
|
|
||||||
|
const Redis = require("ioredis");
|
||||||
|
|
||||||
const TextExtract = require("./textextract");
|
const TextExtract = require("./textextract");
|
||||||
const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");
|
const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");
|
||||||
|
|
||||||
|
@ -567,9 +569,8 @@ class Crawler {
|
||||||
}
|
}
|
||||||
|
|
||||||
// extra wait for all resources to land into WARCs
|
// extra wait for all resources to land into WARCs
|
||||||
console.log("Waiting 5s to ensure WARCs are finished");
|
await this.awaitPendingClear();
|
||||||
await this.sleep(5000);
|
|
||||||
|
|
||||||
if (this.params.combineWARC) {
|
if (this.params.combineWARC) {
|
||||||
await this.combineWARC();
|
await this.combineWARC();
|
||||||
}
|
}
|
||||||
|
@ -807,6 +808,23 @@ class Crawler {
|
||||||
abort.abort();
|
abort.abort();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async awaitPendingClear() {
|
||||||
|
console.log("Waiting to ensure pending data is written to WARC...");
|
||||||
|
|
||||||
|
const redis = new Redis("redis://localhost/0");
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const res = await redis.get(`pywb:${this.params.collection}:pending`);
|
||||||
|
if (res === "0" || !res) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`Still waiting for ${res} pending requests to finish...`);
|
||||||
|
|
||||||
|
await this.sleep(1000);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
sleep(time) {
|
sleep(time) {
|
||||||
return new Promise(resolve => setTimeout(resolve, time));
|
return new Promise(resolve => setTimeout(resolve, time));
|
||||||
}
|
}
|
||||||
|
|
|
@ -103,8 +103,7 @@ async function main() {
|
||||||
let u, p;
|
let u, p;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
u = await page.waitForXPath("//input[contains(@name, 'user')] or contains(@name, 'email')]");
|
u = await page.waitForXPath("//input[contains(@name, 'user') or contains(@name, 'email')]");
|
||||||
|
|
||||||
p = await page.waitForXPath("//input[contains(@name, 'pass') and @type='password']");
|
p = await page.waitForXPath("//input[contains(@name, 'pass') and @type='password']");
|
||||||
|
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
|
|
@ -5,9 +5,13 @@
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
|
"scripts": {
|
||||||
|
"lint": "eslint *.js"
|
||||||
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"abort-controller": "^3.0.0",
|
"abort-controller": "^3.0.0",
|
||||||
"browsertrix-behaviors": "^0.2.0",
|
"browsertrix-behaviors": "^0.2.0",
|
||||||
|
"ioredis": "^4.27.1",
|
||||||
"node-fetch": "^2.6.1",
|
"node-fetch": "^2.6.1",
|
||||||
"puppeteer-cluster": "^0.22.0",
|
"puppeteer-cluster": "^0.22.0",
|
||||||
"puppeteer-core": "^5.3.1",
|
"puppeteer-core": "^5.3.1",
|
||||||
|
|
|
@ -6,13 +6,8 @@ test("check that the collection name is properly validation", async () => {
|
||||||
let passed = "";
|
let passed = "";
|
||||||
|
|
||||||
try{
|
try{
|
||||||
const data = await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
|
await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
|
||||||
if (data.stdout.includes("Waiting 5s to ensure WARCs are finished")){
|
passed = true;
|
||||||
passed = true;
|
|
||||||
}
|
|
||||||
else{
|
|
||||||
passed = false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
catch (error) {
|
catch (error) {
|
||||||
passed = false;
|
passed = false;
|
||||||
|
|
70
yarn.lock
70
yarn.lock
|
@ -1226,6 +1226,11 @@ clone-response@^1.0.2:
|
||||||
dependencies:
|
dependencies:
|
||||||
mimic-response "^1.0.0"
|
mimic-response "^1.0.0"
|
||||||
|
|
||||||
|
cluster-key-slot@^1.1.0:
|
||||||
|
version "1.1.0"
|
||||||
|
resolved "https://registry.yarnpkg.com/cluster-key-slot/-/cluster-key-slot-1.1.0.tgz#30474b2a981fb12172695833052bc0d01336d10d"
|
||||||
|
integrity sha512-2Nii8p3RwAPiFwsnZvukotvow2rIHM+yQ6ZcBXGHdniadkYGZYiGmkHJIbZPIV9nfv7m/U1IPMVVcAhoWFeklw==
|
||||||
|
|
||||||
co@^4.6.0:
|
co@^4.6.0:
|
||||||
version "4.6.0"
|
version "4.6.0"
|
||||||
resolved "https://registry.npmjs.org/co/-/co-4.6.0.tgz"
|
resolved "https://registry.npmjs.org/co/-/co-4.6.0.tgz"
|
||||||
|
@ -1379,6 +1384,13 @@ debug@^2.2.0, debug@^2.3.3:
|
||||||
dependencies:
|
dependencies:
|
||||||
ms "2.0.0"
|
ms "2.0.0"
|
||||||
|
|
||||||
|
debug@^4.3.1:
|
||||||
|
version "4.3.1"
|
||||||
|
resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.1.tgz#f0d229c505e0c6d8c49ac553d1b13dc183f6b2ee"
|
||||||
|
integrity sha512-doEwdvm4PCeK4K3RQN2ZC2BYUBaxwLARCqZmMjtF8a51J2Rb0xpVloFRnCODwqjpwnAoao4pelN8l3RJdv3gRQ==
|
||||||
|
dependencies:
|
||||||
|
ms "2.1.2"
|
||||||
|
|
||||||
decamelize@^1.2.0:
|
decamelize@^1.2.0:
|
||||||
version "1.2.0"
|
version "1.2.0"
|
||||||
resolved "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz"
|
resolved "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz"
|
||||||
|
@ -1450,6 +1462,11 @@ delayed-stream@~1.0.0:
|
||||||
resolved "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz"
|
resolved "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz"
|
||||||
integrity sha1-3zrhmayt+31ECqrgsp4icrJOxhk=
|
integrity sha1-3zrhmayt+31ECqrgsp4icrJOxhk=
|
||||||
|
|
||||||
|
denque@^1.1.0:
|
||||||
|
version "1.5.0"
|
||||||
|
resolved "https://registry.yarnpkg.com/denque/-/denque-1.5.0.tgz#773de0686ff2d8ec2ff92914316a47b73b1c73de"
|
||||||
|
integrity sha512-CYiCSgIF1p6EUByQPlGkKnP1M9g0ZV3qMIrqMqZqdwazygIA/YP2vrbcyl1h/WppKJTdl1F85cXIle+394iDAQ==
|
||||||
|
|
||||||
detect-newline@^3.0.0:
|
detect-newline@^3.0.0:
|
||||||
version "3.1.0"
|
version "3.1.0"
|
||||||
resolved "https://registry.npmjs.org/detect-newline/-/detect-newline-3.1.0.tgz"
|
resolved "https://registry.npmjs.org/detect-newline/-/detect-newline-3.1.0.tgz"
|
||||||
|
@ -2253,6 +2270,22 @@ internal-slot@^1.0.2:
|
||||||
has "^1.0.3"
|
has "^1.0.3"
|
||||||
side-channel "^1.0.4"
|
side-channel "^1.0.4"
|
||||||
|
|
||||||
|
ioredis@^4.27.1:
|
||||||
|
version "4.27.1"
|
||||||
|
resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-4.27.1.tgz#4ef947b455a1b995baa4b0d7e2c4e4f75f746421"
|
||||||
|
integrity sha512-PaFNFeBbOcEYHXAdrJuy7uesJcyvzStTM1aYMchTuky+VgKqDbXhnTJHaDsjAwcTwPx8Asatx+l2DW8zZ2xlsQ==
|
||||||
|
dependencies:
|
||||||
|
cluster-key-slot "^1.1.0"
|
||||||
|
debug "^4.3.1"
|
||||||
|
denque "^1.1.0"
|
||||||
|
lodash.defaults "^4.2.0"
|
||||||
|
lodash.flatten "^4.4.0"
|
||||||
|
p-map "^2.1.0"
|
||||||
|
redis-commands "1.7.0"
|
||||||
|
redis-errors "^1.2.0"
|
||||||
|
redis-parser "^3.0.0"
|
||||||
|
standard-as-callback "^2.1.0"
|
||||||
|
|
||||||
ip-regex@^2.1.0:
|
ip-regex@^2.1.0:
|
||||||
version "2.1.0"
|
version "2.1.0"
|
||||||
resolved "https://registry.npmjs.org/ip-regex/-/ip-regex-2.1.0.tgz"
|
resolved "https://registry.npmjs.org/ip-regex/-/ip-regex-2.1.0.tgz"
|
||||||
|
@ -3078,6 +3111,16 @@ locate-path@^5.0.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
p-locate "^4.1.0"
|
p-locate "^4.1.0"
|
||||||
|
|
||||||
|
lodash.defaults@^4.2.0:
|
||||||
|
version "4.2.0"
|
||||||
|
resolved "https://registry.yarnpkg.com/lodash.defaults/-/lodash.defaults-4.2.0.tgz#d09178716ffea4dde9e5fb7b37f6f0802274580c"
|
||||||
|
integrity sha1-0JF4cW/+pN3p5ft7N/bwgCJ0WAw=
|
||||||
|
|
||||||
|
lodash.flatten@^4.4.0:
|
||||||
|
version "4.4.0"
|
||||||
|
resolved "https://registry.yarnpkg.com/lodash.flatten/-/lodash.flatten-4.4.0.tgz#f31c22225a9632d2bbf8e4addbef240aa765a61f"
|
||||||
|
integrity sha1-8xwiIlqWMtK7+OSt2+8kCqdlph8=
|
||||||
|
|
||||||
lodash.sortby@^4.7.0:
|
lodash.sortby@^4.7.0:
|
||||||
version "4.7.0"
|
version "4.7.0"
|
||||||
resolved "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz"
|
resolved "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz"
|
||||||
|
@ -3484,6 +3527,11 @@ p-locate@^4.1.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
p-limit "^2.2.0"
|
p-limit "^2.2.0"
|
||||||
|
|
||||||
|
p-map@^2.1.0:
|
||||||
|
version "2.1.0"
|
||||||
|
resolved "https://registry.yarnpkg.com/p-map/-/p-map-2.1.0.tgz#310928feef9c9ecc65b68b17693018a665cea175"
|
||||||
|
integrity sha512-y3b8Kpd8OAN444hxfBbFfj1FY/RjtTd8tzYwhUqNYXx0fXx2iX4maP4Qr6qhIKbQXI02wTLAda4fYUbDagTUFw==
|
||||||
|
|
||||||
p-try@^2.0.0:
|
p-try@^2.0.0:
|
||||||
version "2.2.0"
|
version "2.2.0"
|
||||||
resolved "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz"
|
resolved "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz"
|
||||||
|
@ -3729,6 +3777,23 @@ readable-stream@^3.1.1, readable-stream@^3.4.0:
|
||||||
string_decoder "^1.1.1"
|
string_decoder "^1.1.1"
|
||||||
util-deprecate "^1.0.1"
|
util-deprecate "^1.0.1"
|
||||||
|
|
||||||
|
redis-commands@1.7.0:
|
||||||
|
version "1.7.0"
|
||||||
|
resolved "https://registry.yarnpkg.com/redis-commands/-/redis-commands-1.7.0.tgz#15a6fea2d58281e27b1cd1acfb4b293e278c3a89"
|
||||||
|
integrity sha512-nJWqw3bTFy21hX/CPKHth6sfhZbdiHP6bTawSgQBlKOVRG7EZkfHbbHwQJnrE4vsQf0CMNE+3gJ4Fmm16vdVlQ==
|
||||||
|
|
||||||
|
redis-errors@^1.0.0, redis-errors@^1.2.0:
|
||||||
|
version "1.2.0"
|
||||||
|
resolved "https://registry.yarnpkg.com/redis-errors/-/redis-errors-1.2.0.tgz#eb62d2adb15e4eaf4610c04afe1529384250abad"
|
||||||
|
integrity sha1-62LSrbFeTq9GEMBK/hUpOEJQq60=
|
||||||
|
|
||||||
|
redis-parser@^3.0.0:
|
||||||
|
version "3.0.0"
|
||||||
|
resolved "https://registry.yarnpkg.com/redis-parser/-/redis-parser-3.0.0.tgz#b66d828cdcafe6b4b8a428a7def4c6bcac31c8b4"
|
||||||
|
integrity sha1-tm2CjNyv5rS4pCin3vTGvKwxyLQ=
|
||||||
|
dependencies:
|
||||||
|
redis-errors "^1.0.0"
|
||||||
|
|
||||||
regex-not@^1.0.0, regex-not@^1.0.2:
|
regex-not@^1.0.0, regex-not@^1.0.2:
|
||||||
version "1.0.2"
|
version "1.0.2"
|
||||||
resolved "https://registry.npmjs.org/regex-not/-/regex-not-1.0.2.tgz"
|
resolved "https://registry.npmjs.org/regex-not/-/regex-not-1.0.2.tgz"
|
||||||
|
@ -4164,6 +4229,11 @@ stack-utils@^2.0.2:
|
||||||
dependencies:
|
dependencies:
|
||||||
escape-string-regexp "^2.0.0"
|
escape-string-regexp "^2.0.0"
|
||||||
|
|
||||||
|
standard-as-callback@^2.1.0:
|
||||||
|
version "2.1.0"
|
||||||
|
resolved "https://registry.yarnpkg.com/standard-as-callback/-/standard-as-callback-2.1.0.tgz#8953fc05359868a77b5b9739a665c5977bb7df45"
|
||||||
|
integrity sha512-qoRRSyROncaz1z0mvYqIE4lCd9p2R90i6GxW3uZv5ucSu8tU7B5HXUP1gG8pVZsYNVaXjk8ClXHPttLyxAL48A==
|
||||||
|
|
||||||
static-extend@^0.1.1:
|
static-extend@^0.1.1:
|
||||||
version "0.1.2"
|
version "0.1.2"
|
||||||
resolved "https://registry.npmjs.org/static-extend/-/static-extend-0.1.2.tgz"
|
resolved "https://registry.npmjs.org/static-extend/-/static-extend-0.1.2.tgz"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue