From f6b2f79c39ab91b0f2daca73a7838b98bb56c6d8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 5 Sep 2012 13:16:21 -0700 Subject: [PATCH] mon: make heartbeat grace and down out interval scaling optional Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 + src/mon/OSDMonitor.cc | 79 ++++++++++++++++++++++------------------ 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 80fbe9752063a..05376f433bfce 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -106,6 +106,8 @@ OPTION(mon_tick_interval, OPT_INT, 5) OPTION(mon_subscribe_interval, OPT_DOUBLE, 300) OPTION(mon_osd_laggy_halflife, OPT_INT, 60*60) // (seconds) how quickly our laggy estimations decay OPTION(mon_osd_laggy_weight, OPT_DOUBLE, .3) // weight for new 'samples's in laggy estimations +OPTION(mon_osd_adjust_heartbeat_grace, OPT_BOOL, true) // true if we should scale based on laggy estimations +OPTION(mon_osd_adjust_down_out_interval, OPT_BOOL, true) // true if we should scale based on laggy estimations OPTION(mon_osd_auto_mark_in, OPT_BOOL, false) // mark any booting osds 'in' OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL, true) // mark booting auto-marked-out osds 'in' OPTION(mon_osd_auto_mark_new_in, OPT_BOOL, true) // mark booting new osds 'in' diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 7f92ba7a07dd8..223c5aa04c917 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -688,35 +688,38 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) utime_t max_failed_since = fi.get_failed_since(); utime_t failed_for = now - max_failed_since; - double halflife = (double)g_conf->mon_osd_laggy_halflife; - double decay_k = ::log(.5) / halflife; - - // scale grace period based on historical probability of 'lagginess' - // (false positive failures due to slowness). - const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd); - double decay = exp((double)failed_for * decay_k); - dout(20) << " halflife " << halflife << " decay_k " << decay_k - << " failed_for " << failed_for << " decay " << decay << dendl; utime_t grace = orig_grace; - double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability; - grace += my_grace; - - // consider the peers reporting a failure a proxy for a potential - // 'subcluster' over the overall cluster that is similarly - // laggy. this is clearly not true in all cases, but will sometimes - // help us localize the grace correction to a subset of the system - // (say, a rack with a bad switch) that is unhappy. - assert(fi.reporters.size()); - for (map::iterator p = fi.reporters.begin(); - p != fi.reporters.end(); - p++) { - const osd_xinfo_t& xi = osdmap.get_xinfo(p->first); - utime_t elapsed = now - xi.down_stamp; - double decay = exp((double)elapsed * decay_k); - peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability; + double my_grace = 0, peer_grace = 0; + if (g_conf->mon_osd_adjust_heartbeat_grace) { + double halflife = (double)g_conf->mon_osd_laggy_halflife; + double decay_k = ::log(.5) / halflife; + + // scale grace period based on historical probability of 'lagginess' + // (false positive failures due to slowness). + const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd); + double decay = exp((double)failed_for * decay_k); + dout(20) << " halflife " << halflife << " decay_k " << decay_k + << " failed_for " << failed_for << " decay " << decay << dendl; + my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability; + grace += my_grace; + + // consider the peers reporting a failure a proxy for a potential + // 'subcluster' over the overall cluster that is similarly + // laggy. this is clearly not true in all cases, but will sometimes + // help us localize the grace correction to a subset of the system + // (say, a rack with a bad switch) that is unhappy. + assert(fi.reporters.size()); + for (map::iterator p = fi.reporters.begin(); + p != fi.reporters.end(); + p++) { + const osd_xinfo_t& xi = osdmap.get_xinfo(p->first); + utime_t elapsed = now - xi.down_stamp; + double decay = exp((double)elapsed * decay_k); + peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability; + } + peer_grace /= (double)fi.reporters.size(); + grace += peer_grace; } - peer_grace /= (double)fi.reporters.size(); - grace += peer_grace; dout(10) << " osd." << target_osd << " has " << fi.reporters.size() << " reporters and " @@ -1397,17 +1400,21 @@ void OSDMonitor::tick() if (osdmap.is_down(o) && osdmap.is_in(o) && can_mark_out(o)) { - // scale grace period the same way we do the heartbeat grace. - const osd_xinfo_t& xi = osdmap.get_xinfo(o); utime_t orig_grace(g_conf->mon_osd_down_out_interval, 0); - double halflife = (double)g_conf->mon_osd_laggy_halflife; - double decay_k = ::log(.5) / halflife; - double decay = exp((double)down * decay_k); - dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k - << " down for " << down << " decay " << decay << dendl; - double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability; utime_t grace = orig_grace; - grace += my_grace; + double my_grace = 0.0; + + if (g_conf->mon_osd_adjust_down_out_interval) { + // scale grace period the same way we do the heartbeat grace. + const osd_xinfo_t& xi = osdmap.get_xinfo(o); + double halflife = (double)g_conf->mon_osd_laggy_halflife; + double decay_k = ::log(.5) / halflife; + double decay = exp((double)down * decay_k); + dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k + << " down for " << down << " decay " << decay << dendl; + my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability; + grace += my_grace; + } if (g_conf->mon_osd_down_out_interval > 0 && down.sec() >= grace) { -- 2.39.5