const Job = require("puppeteer-cluster/dist/Job").default;


// ============================================================================
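// Base class for crawl state implementations. Putting the state into "drain"
// mode (setDrain) freezes drainMax at pending + done and makes size() report
// 0, so the puppeteer-cluster fork stops pulling new URLs while in-flight
// pages finish; numSeen() then reports the frozen drainMax to keep the
// cluster stats consistent.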
class BaseState
{
  constructor() {
    this.drainMax = 0;
  }

  async setDrain() {
    this.drainMax = (await this.numPending()) + (await this.numDone());
  }

  async size() {
    return this.drainMax ? 0 : await this.realSize();
  }

  async finished() {
    return await this.realSize() == 0;
  }

  async numSeen() {
    return this.drainMax ? this.drainMax : await this.numRealSeen();
  }

  recheckScope(data, seeds) {
    const seed = seeds[data.seedId];

    return seed.isIncluded(data.url, data.depth, data.extraHops);
  }
}


// ============================================================================
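// Per-task callbacks for the in-memory state: the puppeteer-cluster fork
// invokes start()/resolve()/reject() on the callbacks object attached to each
// Job, and these keep the URL's JSON mirrored between the pending set and the
// done list.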
class MemTaskCallbacks
{
  constructor(data, state) {
    this.data = data;
    this.state = state;

    this.json = JSON.stringify(this.data);
    this.state.pending.add(this.json);
  }

  start() {
    this.state.pending.delete(this.json);

    this.data.started = new Date().toISOString();
    this.json = JSON.stringify(this.data);

    this.state.pending.add(this.json);
  }

  resolve() {
    this.state.pending.delete(this.json);

    this.data.finished = new Date().toISOString();

    this.state.done.unshift(this.data);
  }

  reject(e) {
    this.state.pending.delete(this.json);

    console.warn(`URL Load Failed: ${this.data.url}, Reason: ${e}`);

    this.data.failed = true;

    this.state.done.unshift(this.data);
  }
}


// ============================================================================
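// In-memory crawl state. seenList tracks every URL ever queued; queue is the
// frontier (unshift + pop = FIFO); pending holds JSON of in-flight URLs; done
// holds completed (or failed) URL data. serialize()/load() convert to and
// from the {queued, pending, done} snapshot used for the saved YAML state:
// on load, pending and failed URLs are re-queued so they can be retried.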
class MemoryCrawlState extends BaseState
{
  constructor() {
    super();
    this.seenList = new Set();
    this.queue = [];
    this.pending = new Set();
    this.done = [];
  }

  push(job) {
    this.pending.delete(JSON.stringify(job.data));

    this.queue.unshift(job.data);
  }

  realSize() {
    return this.queue.length;
  }

  shift() {
    const data = this.queue.pop();

    const callback = new MemTaskCallbacks(data, this);

    return new Job(data, undefined, callback);
  }

  has(url) {
    return this.seenList.has(url);
  }

  add(url) {
    return this.seenList.add(url);
  }

  async serialize() {
    const queued = this.queue.map(x => JSON.stringify(x));
    const pending = Array.from(this.pending.values());
    const done = this.done.map(x => JSON.stringify(x));

    return {queued, pending, done};
  }

  async load(state, seeds, checkScope=false) {
    for (const json of state.queued) {
      const data = JSON.parse(json);
      if (checkScope && !this.recheckScope(data, seeds)) {
        continue;
      }
      this.queue.push(data);
      this.seenList.add(data.url);
    }

    for (const json of state.pending) {
      const data = JSON.parse(json);
      if (checkScope && !this.recheckScope(data, seeds)) {
        continue;
      }
      this.queue.push(data);
      this.seenList.add(data.url);
    }

    for (const json of state.done) {
      const data = JSON.parse(json);
      if (data.failed) {
        this.queue.push(data);
      } else {
        this.done.push(data);
      }
      this.seenList.add(data.url);
    }

    return this.seenList.size;
  }

  async numDone() {
    return this.done.length;
  }

  async numRealSeen() {
    return this.seenList.size;
  }

  async numPending() {
    return this.pending.size;
  }
}


// ============================================================================
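// Per-task callbacks for the redis-backed state: each transition delegates to
// a RedisCrawlState helper backed by a Lua script, so pending -> done moves
// stay atomic even with multiple crawler processes sharing the same state.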
class RedisTaskCallbacks
{
  constructor(json, state) {
    this.state = state;
    this.json = json;
    this.data = JSON.parse(json);
  }

  async start() {
    console.log("Start");
    this.json = await this.state._markStarted(this.json);
    console.log("Started: " + this.json);
  }

  async resolve() {
    // atomically move from pending set -> done list while adding finished timestamp
    await this.state._finish(this.json);
  }

  async reject(e) {
    console.warn(`URL Load Failed: ${this.data.url}, Reason: ${e}`);
    await this.state._fail(this.json);
  }
}


// ============================================================================
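// Redis-backed crawl state, namespaced by crawl id: <key>:q is the frontier
// list, <key>:p the pending set, <key>:s the seen set, and <key>:d the done
// list. pageTimeout is converted to seconds; per the 0.5.0 notes it backs
// the pending-entry expiry.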
class RedisCrawlState extends BaseState
{
  constructor(redis, key, pageTimeout) {
    super();
    this.redis = redis;

    this.key = key;
    this.pageTimeout = pageTimeout / 1000;

    this.qkey = this.key + ":q";
    this.pkey = this.key + ":p";
    this.skey = this.key + ":s";
    this.dkey = this.key + ":d";
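
    // Each command defined below runs as a single Lua script, so every
    // queue <-> pending <-> done move executes atomically, even with
    // multiple crawler processes sharing one crawl state.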
    redis.defineCommand("addqueue", {
      numberOfKeys: 2,
      lua: "redis.call('srem', KEYS[1], ARGV[1]); redis.call('lpush', KEYS[2], ARGV[1])"
    });

    redis.defineCommand("markpending", {
      numberOfKeys: 2,
      lua: "local json = redis.call('rpop', KEYS[1]); redis.call('sadd', KEYS[2], json); return json"
    });

    redis.defineCommand("markstarted", {
      numberOfKeys: 1,
      lua: "local json = ARGV[1]; if (redis.call('srem', KEYS[1], json)) then local data = cjson.decode(json); data['started'] = ARGV[2]; json = cjson.encode(data); redis.call('sadd', KEYS[1], json); end; return json"
    });

    redis.defineCommand("movefinished", {
      numberOfKeys: 2,
      lua: "local json = ARGV[1]; if (redis.call('srem', KEYS[1], json)) then local data = cjson.decode(json); data[ARGV[3]] = ARGV[2]; json = cjson.encode(data); redis.call('lpush', KEYS[2], json); end; return json"
    });
  }

  async _markPending() {
    return await this.redis.markpending(this.qkey, this.pkey);
  }

  async _markStarted(json) {
    const started = new Date().toISOString();

    return await this.redis.markstarted(this.pkey, json, started);
  }

  async _finish(json) {
    const finished = new Date().toISOString();

    return await this.redis.movefinished(this.pkey, this.dkey, json, finished, "finished");
  }

  async _fail(json) {
    return await this.redis.movefinished(this.pkey, this.dkey, json, true, "failed");
  }

  async push(job) {
    //await this.redis.lpush(this.qkey, JSON.stringify(job.data));
    await this.redis.addqueue(this.pkey, this.qkey, JSON.stringify(job.data));
  }

  async realSize() {
    return await this.redis.llen(this.qkey);
  }

  async shift() {
    const json = await this._markPending();

    const callback = new RedisTaskCallbacks(json, this);

    return new Job(callback.data, undefined, callback);
  }

  async has(url) {
    return !!await this.redis.sismember(this.skey, url);
  }

  async add(url) {
    return await this.redis.sadd(this.skey, url);
  }

  async serialize() {
    const queued = await this.redis.lrange(this.qkey, 0, -1);
    const pending = await this.redis.smembers(this.pkey);
    const done = await this.redis.lrange(this.dkey, 0, -1);

    return {queued, pending, done};
  }
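
  // Rebuild the redis keys from a serialized {queued, pending, done} snapshot
  // (the same shape serialize() produces). Pending URLs are re-queued since
  // they never finished, and done entries marked failed are re-queued for
  // retry. With checkScope set, queued URLs are re-checked against the
  // (possibly updated) seed scoping rules and dropped if now out of scope.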
  async load(state, seeds, checkScope) {
    const seen = [];

    // delete any existing keys to fully reset the state
    await this.redis.del(this.qkey);
    await this.redis.del(this.pkey);
    await this.redis.del(this.dkey);
    await this.redis.del(this.skey);

    for (const json of state.queued) {
      const data = JSON.parse(json);
      if (checkScope) {
        if (!this.recheckScope(data, seeds)) {
          continue;
        }
      }

      await this.redis.rpush(this.qkey, json);
      seen.push(data.url);
    }

    for (const json of state.pending) {
      const data = JSON.parse(json);
      if (checkScope) {
        if (!this.recheckScope(data, seeds)) {
          continue;
        }
      }

      await this.redis.rpush(this.qkey, json);
      seen.push(data.url);
    }

    for (const json of state.done) {
      const data = JSON.parse(json);
      if (data.failed) {
        await this.redis.rpush(this.qkey, json);
      } else {
        await this.redis.rpush(this.dkey, json);
      }
      seen.push(data.url);
    }

    await this.redis.sadd(this.skey, seen);
    return seen.length;
  }

  async numDone() {
    return await this.redis.llen(this.dkey);
  }

  async numRealSeen() {
    return await this.redis.scard(this.skey);
  }

  async numPending() {
    return await this.redis.scard(this.pkey);
  }
}


module.exports.RedisCrawlState = RedisCrawlState;
module.exports.MemoryCrawlState = MemoryCrawlState;
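
// Usage sketch (hypothetical; the real wiring lives in the crawler itself):
// the crawler picks a state implementation based on the --redisStore option
// and hands it to the puppeteer-cluster fork. Names like params.redisUrl,
// pageTimeoutMs, and the require path are illustrative only.
//
//   const Redis = require("ioredis");
//   const { MemoryCrawlState, RedisCrawlState } = require("./util/state");
//
//   const state = params.redisStore
//     ? new RedisCrawlState(new Redis(params.redisUrl), params.crawlId, pageTimeoutMs)
//     : new MemoryCrawlState();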