From: Xiaoxi Chen Date: Wed, 11 Nov 2015 07:08:35 +0000 (+0800) Subject: mon: support min_down_reporter counted by subtree level X-Git-Tag: v10.0.2~54^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bcb8f362ec6ac47c4908118e7860dec7971d001f;p=ceph.git mon: support min_down_reporter counted by subtree level In many cases, OSDs on an isolated node (public/cluster connection lost but osd<->mon is good) will report other OSDs down to the monitor, which usually wrongly marks them down. Today "osd_min_down_reporters" counts individual OSDs; we would like to extend its semantics so that reporters can be counted by host or rack. Users can then require failure reports from at least two nodes before an OSD is marked down, which should prevent an isolated host from causing trouble for the cluster. Signed-off-by: Xiaoxi Chen --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index a19cc5d8ff78..102bc19d94fb 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -276,7 +276,8 @@ OPTION(mon_sync_debug_leader, OPT_INT, -1) // monitor to be used as the sync lea OPTION(mon_sync_debug_provider, OPT_INT, -1) // monitor to be used as the sync provider OPTION(mon_sync_debug_provider_fallback, OPT_INT, -1) // monitor to be used as fallback if sync provider fails OPTION(mon_inject_sync_get_chunk_delay, OPT_DOUBLE, 0) // inject N second delay on each get_chunk request -OPTION(mon_osd_min_down_reporters, OPT_INT, 2) // number of OSDs who need to report a down OSD for it to count +OPTION(mon_osd_min_down_reporters, OPT_INT, 2) // number of OSDs from different subtrees who need to report a down OSD for it to count +OPTION(mon_osd_reporter_subtree_level , OPT_STR, "host") // in which level of parent bucket the reporters are counted OPTION(mon_osd_force_trim_to, OPT_INT, 0) // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous, use with care) OPTION(mon_mds_force_trim_to, OPT_INT, 0) // force mon to trim mdsmaps to this point (dangerous, use 
with care) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index c3c684f6ce5d..39cb3a505f1a 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1635,6 +1635,8 @@ void OSDMonitor::check_failures(utime_t now) bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) { + set reporters_by_subtree; + string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level; utime_t orig_grace(g_conf->osd_heartbeat_grace, 0); utime_t max_failed_since = fi.get_failed_since(); utime_t failed_for = now - max_failed_since; @@ -1663,6 +1665,16 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) for (map::iterator p = fi.reporters.begin(); p != fi.reporters.end(); ++p) { + // get the parent bucket whose type matches with "reporter_subtree_level". + // fall back to OSD if the level doesn't exist. + map reporter_loc = osdmap.crush->get_full_location(p->first); + map::iterator iter = reporter_loc.find(reporter_subtree_level); + if (iter == reporter_loc.end()) { + reporters_by_subtree.insert("osd." + to_string(p->first)); + } else { + reporters_by_subtree.insert(iter->second); + } + const osd_xinfo_t& xi = osdmap.get_xinfo(p->first); utime_t elapsed = now - xi.down_stamp; double decay = exp((double)elapsed * decay_k); @@ -1685,15 +1697,17 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) return true; } + if (failed_for >= grace && - ((int)fi.reporters.size() >= g_conf->mon_osd_min_down_reporters)) { + (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) { dout(1) << " we have enough reporters to mark osd." 
<< target_osd << " down" << dendl; pending_inc.new_state[target_osd] = CEPH_OSD_UP; mon->clog->info() << osdmap.get_inst(target_osd) << " failed (" - << (int)fi.reporters.size() << " reporters after " - << failed_for << " >= grace " << grace << ")\n"; + << (int)reporters_by_subtree.size() << " reporters from different " + << reporter_subtree_level << " after " + << failed_for << " >= grace " << grace << ")\n"; return true; } return false;