OPTION(mon_data_avail_warn, OPT_INT, 30)
// issue a warning when the monitor's data store goes over 15GB (in bytes).
// The ULL suffix forces 64-bit arithmetic: evaluated as plain int, the product
// 15*1024*1024*1024 overflows a 32-bit int before it is stored in the U64 option.
OPTION(mon_data_size_warn, OPT_U64, 15ULL*1024*1024*1024)
OPTION(mon_scrub_interval, OPT_INT, 3600*24) // once a day
+OPTION(mon_scrub_timeout, OPT_INT, 60*5) // scrub round timeout (5 minutes); when it expires the scrub is restarted
OPTION(mon_scrub_max_keys, OPT_INT, 100) // max number of keys to scrub each time
OPTION(mon_scrub_inject_crc_mismatch, OPT_DOUBLE, 0.0) // probability of injected crc mismatch [0.0, 1.0]
OPTION(mon_scrub_inject_missing_keys, OPT_DOUBLE, 0.0) // probability of injected missing keys [0.0, 1.0]
// scrub
scrub_version(0),
scrub_event(NULL),
+ scrub_timeout_event(NULL),
// sync state
sync_provider_count(0),
assert(is_leader());
assert(scrub_state);
+ scrub_cancel_timeout();
wait_for_paxos_write();
scrub_version = paxos->get_version();
+
// scrub all keys if we're the only monitor in the quorum
int32_t num_keys =
(quorum.size() == 1 ? -1 : cct->_conf->mon_scrub_max_keys);
scrub_state->finished = !r;
+ // only after we got our scrub results do we really care whether the
+ // other monitors are late on their results. Also, this way we avoid
+ // triggering the timeout if we end up getting stuck in _scrub() for
+ // longer than the duration of the timeout.
+ scrub_reset_timeout();
+
if (quorum.size() == 1) {
assert(scrub_state->finished == true);
scrub_finish();
break;
if (m->version != scrub_version)
break;
+ // reset the timeout each time we get a result
+ scrub_reset_timeout();
+
int from = m->get_source().num();
assert(scrub_result.count(from) == 0);
scrub_result[from] = m->result;
clog->info() << "scrub ok on " << quorum << ": " << mine << "\n";
}
+/**
+ * Handle expiry of the scrub round timeout.
+ *
+ * Throws away the state of the scrub in progress (scrub_reset()) and
+ * starts a new scrub (scrub_start()).
+ *
+ * Deliberately not 'inline': the in-class declaration is not inline and
+ * this function is called from C_ScrubTimeout::finish(), so it needs a
+ * regular out-of-line definition with external linkage.
+ */
+void Monitor::scrub_timeout()
+{
+  dout(1) << __func__ << " restarting scrub" << dendl;
+  scrub_reset();
+  scrub_start();
+}
+
void Monitor::scrub_finish()
{
dout(10) << __func__ << dendl;
// Drop all state of the current scrub: cancel the pending round timeout (if
// any), zero the scrub version, forget the collected results and reset the
// scrub state.
void Monitor::scrub_reset()
{
dout(10) << __func__ << dendl;
+ scrub_cancel_timeout(); // no-op when no timeout event is pending
scrub_version = 0;
scrub_result.clear();
scrub_state.reset();
}
}
+/**
+ * Cancel the pending scrub round timeout, if one is armed.
+ *
+ * Does nothing when scrub_timeout_event is NULL, so it may be called
+ * unconditionally. On return scrub_timeout_event is always NULL.
+ *
+ * Deliberately not 'inline': the in-class declaration is not inline, and
+ * the definition should match it (as scrub_reset_timeout() does).
+ */
+void Monitor::scrub_cancel_timeout()
+{
+  if (scrub_timeout_event) {
+    timer.cancel_event(scrub_timeout_event);
+    scrub_timeout_event = NULL;
+  }
+}
+
+/**
+ * (Re)arm the scrub round timeout.
+ *
+ * Cancels any timeout that is still pending and schedules a fresh
+ * C_ScrubTimeout to run after mon_scrub_timeout; when it fires it calls
+ * scrub_timeout().
+ */
+void Monitor::scrub_reset_timeout()
+{
+  dout(15) << __func__ << " reset timeout event" << dendl;
+  scrub_cancel_timeout();
+  scrub_timeout_event = new C_ScrubTimeout(this);
+  // read the option through the monitor's own context, like the other
+  // scrub options (e.g. mon_scrub_max_keys), rather than the global g_conf
+  timer.add_event_after(cct->_conf->mon_scrub_timeout, scrub_timeout_event);
+}
+
/************ TICK ***************/
class C_Mon_Tick : public Context {
pair<string,string> *start,
int *num_keys);
void scrub_check_results();
+ void scrub_timeout(); ///< scrub round timed out: reset the scrub and start it again
void scrub_finish();
void scrub_reset(); ///< cancel the round timeout and clear scrub version, results and state
mon->scrub_start();
}
};
+ /**
+  * Context scheduled on the monitor's timer to bound a scrub round.
+  * When it runs it simply forwards to Monitor::scrub_timeout().
+  */
+ struct C_ScrubTimeout : public Context {
+   Monitor *mon;  ///< monitor whose scrub round is being timed
+
+   C_ScrubTimeout(Monitor *m)
+     : mon(m)
+   {
+   }
+
+   /// Completion hook; the result code @p r is ignored.
+   void finish(int r)
+   {
+     mon->scrub_timeout();
+   }
+ };
Context *scrub_event; ///< periodic event to trigger scrub (leader)
+ Context *scrub_timeout_event; ///< scrub round timeout (leader); NULL when no timeout is pending
void scrub_event_start();
void scrub_event_cancel();
+ void scrub_reset_timeout(); ///< cancel any pending scrub round timeout and arm a fresh one
+ void scrub_cancel_timeout(); ///< cancel the pending scrub round timeout, if any
struct ScrubState {
pair<string,string> last_key; ///< last scrubbed key