From: Joao Eduardo Luis Date: Mon, 28 Jan 2013 19:27:31 +0000 (+0000) Subject: mon: Monitor: rework timecheck code to clarify logic boundaries X-Git-Tag: v0.57~93^2~3^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0ded0fdf58b80ba252d3faf5da99d4be384528db;p=ceph.git mon: Monitor: rework timecheck code to clarify logic boundaries The initial timecheck implementation relied on a cleanup function to clean the state each time we changed epochs (or we got out of quorum), and we would have to clean up the state in-between rounds in a potentially confusing way some time down the line. This patch creates logic boundaries in the code flow, making it clear where we set up or clear the state when we start or finish an epoch, and where we set up or clear the round state in-between rounds. It also allowed for some other changes in behavior, such as when we set-up the timecheck event, or when we cancel it. Despite the slight increase in size, the mechanism just got more easily understandable than it was before. Signed-off-by: Joao Eduardo Luis --- diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 5209e4cb4706..699db8968f17 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -736,7 +736,7 @@ void Monitor::reset() { dout(10) << "reset" << dendl; - timecheck_cleanup(); + timecheck_finish(); leader_since = utime_t(); if (!quorum.empty()) { @@ -1189,7 +1189,7 @@ void Monitor::win_election(epoch_t epoch, set& active, uint64_t features) finish_election(); if (monmap->size() > 1) - timecheck(); + timecheck_start(); } void Monitor::lose_election(epoch_t epoch, set &q, int l, uint64_t features) @@ -1213,7 +1213,7 @@ void Monitor::lose_election(epoch_t epoch, set &q, int l, uint64_t features void Monitor::finish_election() { - timecheck_cleanup(); + timecheck_finish(); exited_quorum = utime_t(); finish_contexts(g_ceph_context, waitfor_quorum); finish_contexts(g_ceph_context, maybe_wait_for_quorum); @@ -2241,18 +2241,98 @@ bool Monitor::_ms_dispatch(Message *m) return ret; } +void Monitor::timecheck_start() +{ + dout(10) << __func__ << dendl; + timecheck_cleanup(); + timecheck_start_round(); +} + +void Monitor::timecheck_finish() +{ + dout(10) << __func__ << dendl; + timecheck_cleanup(); +} + +void Monitor::timecheck_start_round() +{ + dout(10) << __func__ << " curr " << timecheck_round << dendl; + assert(is_leader()); + + if (monmap->size() == 1) { + assert(0 == "We are alone; this shouldn't have been scheduled!"); + return; + } + + if (timecheck_round % 2) { + dout(10) << __func__ << " there's a timecheck going on" << dendl; + utime_t curr_time = ceph_clock_now(g_ceph_context); + double max = g_conf->mon_timecheck_interval*3; + if (curr_time - timecheck_round_start > max) { + dout(10) << __func__ << " keep current round going" << dendl; + goto out; + } else { + dout(10) << __func__ + << " finish current timecheck and start new" << dendl; + timecheck_cancel_round(); + } + } + + assert(timecheck_round % 2 == 0); + timecheck_acks = 0; + timecheck_round ++; + timecheck_round_start = ceph_clock_now(g_ceph_context); + dout(10) << __func__ << " new " << timecheck_round << dendl; + + timecheck(); +out: + dout(10) << __func__ << " setting up next event" << dendl; + timecheck_event = new C_TimeCheck(this); + timer.add_event_after(g_conf->mon_timecheck_interval, timecheck_event); +} + +void Monitor::timecheck_finish_round(bool success) +{ + dout(10) << __func__ << " curr " << timecheck_round << dendl; + assert(timecheck_round % 2); + timecheck_round ++; + timecheck_round_start = utime_t(); + + if (success) { + assert(timecheck_waiting.size() == 0); + assert(timecheck_acks == quorum.size()); + timecheck_report(); + return; + } + + dout(10) << __func__ << " " << timecheck_waiting.size() + << " peers still waiting:"; + for (map::iterator p = timecheck_waiting.begin(); + p != timecheck_waiting.end(); ++p) { + *_dout << " " << p->first.name; + } + *_dout << dendl; + timecheck_waiting.clear(); + + dout(10) << __func__ << " finished to " << timecheck_round << dendl; +} + +void Monitor::timecheck_cancel_round() +{ + timecheck_finish_round(false); +} + void Monitor::timecheck_cleanup() { timecheck_round = 0; timecheck_acks = 0; + timecheck_round_start = utime_t(); if (timecheck_event) { timer.cancel_event(timecheck_event); timecheck_event = NULL; } - - if (timecheck_waiting.size() > 0) - timecheck_waiting.clear(); + timecheck_waiting.clear(); timecheck_skews.clear(); timecheck_latencies.clear(); } @@ -2301,20 +2381,12 @@ void Monitor::timecheck() { dout(10) << __func__ << dendl; assert(is_leader()); - if (monmap->size() == 1) { - assert(0 == "We are alone; this shouldn't have been scheduled!"); + assert(0 == "We are alone; we shouldn't have gotten here!"); return; } + assert(timecheck_round % 2 != 0); - if ((timecheck_round % 2) != 0) { - dout(15) << __func__ - << " timecheck still in progress; laggy monitors maybe?" - << dendl; - goto out; - } - - timecheck_round++; timecheck_acks = 1; // we ack ourselves dout(10) << __func__ << " start timecheck epoch " << get_epoch() @@ -2337,12 +2409,6 @@ void Monitor::timecheck() dout(10) << __func__ << " send " << *m << " to " << inst << dendl; messenger->send_message(m, inst); } - -out: - dout(10) << __func__ << " setting up next event and timeout" << dendl; - timecheck_event = new C_TimeCheck(this); - - timer.add_event_after(g_conf->mon_timecheck_interval, timecheck_event); } health_status_t Monitor::timecheck_status(ostringstream &ss, @@ -2395,9 +2461,7 @@ void Monitor::handle_timecheck_leader(MTimeCheck *m) dout(1) << __func__ << " our clock was readjusted --" << " bump round and drop current check" << dendl; - timecheck_round++; - timecheck_acks = 0; - timecheck_waiting.clear(); + timecheck_cancel_round(); return; } @@ -2482,8 +2546,7 @@ void Monitor::handle_timecheck_leader(MTimeCheck *m) assert(timecheck_skews.size() == timecheck_acks); assert(timecheck_waiting.size() == 0); // everyone has acked, so bump the round to finish it. - timecheck_round++; - timecheck_report(); + timecheck_finish_round(); } } diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index 9716e351348e..c7704bb16da3 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -238,6 +238,7 @@ private: // finished. version_t timecheck_round; unsigned int timecheck_acks; + utime_t timecheck_round_start; /** * Time Check event. */ @@ -247,10 +248,15 @@ private: Monitor *mon; C_TimeCheck(Monitor *m) : mon(m) { } void finish(int r) { - mon->timecheck(); + mon->timecheck_start_round(); } }; + void timecheck_start(); + void timecheck_finish(); + void timecheck_start_round(); + void timecheck_finish_round(bool success = true); + void timecheck_cancel_round(); void timecheck_cleanup(); void timecheck_report(); void timecheck();