From: Kefu Chai Date: Thu, 11 Mar 2021 11:49:36 +0000 (+0800) Subject: mon/OSDMonitor: extract get_grace_time() X-Git-Tag: v16.2.5~118^2~2^2~4 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=84fd33e7078da034c2b105b8c277d551ebbc31fd;p=ceph.git mon/OSDMonitor: extract get_grace_time() for better readability Signed-off-by: Kefu Chai (cherry picked from commit d42815d5e9c4ba781ea710ef299cb9319f7fc3e6) --- diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index ff3f9d0b5e65e..dbba6cab33cc5 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3189,6 +3189,53 @@ bool OSDMonitor::check_failures(utime_t now) return found_failure; } +utime_t OSDMonitor::get_grace_time(utime_t now, + int target_osd, + failure_info_t& fi) const +{ + utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0); + if (!g_conf()->mon_osd_adjust_heartbeat_grace) { + return orig_grace; + } + utime_t grace = orig_grace; + double halflife = (double)g_conf()->mon_osd_laggy_halflife; + double decay_k = ::log(.5) / halflife; + + // scale grace period based on historical probability of 'lagginess' + // (false positive failures due to slowness). + const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd); + const utime_t failed_for = now - fi.get_failed_since(); + double decay = exp((double)failed_for * decay_k); + dout(20) << " halflife " << halflife << " decay_k " << decay_k + << " failed_for " << failed_for << " decay " << decay << dendl; + double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability; + grace += my_grace; + + // consider the peers reporting a failure a proxy for a potential + // 'subcluster' over the overall cluster that is similarly + // laggy. this is clearly not true in all cases, but will sometimes + // help us localize the grace correction to a subset of the system + // (say, a rack with a bad switch) that is unhappy.
+ double peer_grace = 0; + for (auto& [reporter, report] : fi.reporters) { + if (osdmap.exists(reporter)) { + const osd_xinfo_t& xi = osdmap.get_xinfo(reporter); + utime_t elapsed = now - xi.down_stamp; + double decay = exp((double)elapsed * decay_k); + peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability; + } + } + peer_grace /= (double)fi.reporters.size(); + grace += peer_grace; + dout(10) << " osd." << target_osd << " has " + << fi.reporters.size() << " reporters, " + << grace << " grace (" << orig_grace << " + " << my_grace + << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since() + << dendl; + + return grace; +} + bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) { // already pending failure? @@ -3200,32 +3247,6 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) set reporters_by_subtree; auto reporter_subtree_level = g_conf().get_val("mon_osd_reporter_subtree_level"); - utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0); - utime_t max_failed_since = fi.get_failed_since(); - utime_t failed_for = now - max_failed_since; - - utime_t grace = orig_grace; - double my_grace = 0, peer_grace = 0; - double decay_k = 0; - if (g_conf()->mon_osd_adjust_heartbeat_grace) { - double halflife = (double)g_conf()->mon_osd_laggy_halflife; - decay_k = ::log(.5) / halflife; - - // scale grace period based on historical probability of 'lagginess' - // (false positive failures due to slowness). - const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd); - double decay = exp((double)failed_for * decay_k); - dout(20) << " halflife " << halflife << " decay_k " << decay_k - << " failed_for " << failed_for << " decay " << decay << dendl; - my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability; - grace += my_grace; - } - - // consider the peers reporting a failure a proxy for a potential - // 'subcluster' over the overall cluster that is similarly - // laggy.
this is clearly not true in all cases, but will sometimes - help us localize the grace correction to a subset of the system - (say, a rack with a bad switch) that is unhappy. ceph_assert(fi.reporters.size()); for (auto p = fi.reporters.begin(); p != fi.reporters.end();) { // get the parent bucket whose type matches with "reporter_subtree_level". @@ -3238,32 +3259,18 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) } else { reporters_by_subtree.insert(iter->second); } - if (g_conf()->mon_osd_adjust_heartbeat_grace) { - const osd_xinfo_t& xi = osdmap.get_xinfo(p->first); - utime_t elapsed = now - xi.down_stamp; - double decay = exp((double)elapsed * decay_k); - peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability; - } ++p; } else { fi.cancel_report(p->first);; p = fi.reporters.erase(p); } } - - if (g_conf()->mon_osd_adjust_heartbeat_grace) { - peer_grace /= (double)fi.reporters.size(); - grace += peer_grace; + if (reporters_by_subtree.size() < g_conf().get_val("mon_osd_min_down_reporters")) { + return false; } - - dout(10) << " osd." << target_osd << " has " - << fi.reporters.size() << " reporters, " - << grace << " grace (" << orig_grace << " + " << my_grace - << " + " << peer_grace << "), max_failed_since " << max_failed_since - << dendl; - - if (failed_for >= grace && - reporters_by_subtree.size() >= g_conf().get_val("mon_osd_min_down_reporters")) { + const utime_t failed_for = now - fi.get_failed_since(); + const utime_t grace = get_grace_time(now, target_osd, fi); + if (failed_for >= grace) { dout(1) << " we have enough reporters to mark osd."
<< target_osd << " down" << dendl; pending_inc.new_state[target_osd] = CEPH_OSD_UP; diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index fe179e5db5926..23d8d4ab1ed56 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -236,6 +236,7 @@ public: bool check_failures(utime_t now); bool check_failure(utime_t now, int target_osd, failure_info_t& fi); + utime_t get_grace_time(utime_t now, int target_osd, failure_info_t& fi) const; void force_failure(int target_osd, int by); bool _have_pending_crush();