From: xie xingguo Date: Wed, 8 Aug 2018 09:52:29 +0000 (+0800) Subject: osd/OSD.cc: choose heartbeat peers by failure domain X-Git-Tag: v14.0.1~633^2~2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=bcc11541b86762d5821b24ef220b6ef079da2515;p=ceph.git osd/OSD.cc: choose heartbeat peers by failure domain By default, the monitor requires at least two valid failure votes/reports from different hosts to mark an OSD down, which sometimes turns out to be impossible for a replicated pool of size 2 in clusters made up of hosts with contiguously labeled OSDs. This patch instead does a breadth-first search based on the highest-level failure domain cluster-wide, to try to make the heartbeat peers cover all failure domains whenever possible, which can hopefully help accelerate OSD failure detection in the above case. Signed-off-by: xie xingguo --- diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index e3747859ccd5d..58e44da918368 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -1524,6 +1524,19 @@ int CrushWrapper::get_parent_of_type(int item, int type, int rule) const return 0; // not found } +void CrushWrapper::get_subtree_of_type(int type, vector *subtrees) +{ + set roots; + find_roots(&roots); + for (auto r: roots) { + crush_bucket *b = get_bucket(r); + if (IS_ERR(b)) + continue; + get_children_of_type(b->id, type, subtrees); + } +} + + int CrushWrapper::rename_class(const string& srcname, const string& dstname) { auto i = class_rname.find(srcname); diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index 36f8012fd87a7..40a0f1be19d25 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -741,6 +741,10 @@ public: int type, vector *children, bool exclude_shadow = true) const; + /** + * enumerate all subtrees by type + */ + void get_subtree_of_type(int type, vector *subtrees); /** * get failure-domain type of a specific crush rule diff --git a/src/osd/OSD.cc 
b/src/osd/OSD.cc index 86e06dd561995..d699f1f63045f 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4353,6 +4353,13 @@ void OSD::maybe_update_heartbeat_peers() if (prev >= 0 && prev != next) want.insert(prev); + // make sure we have at least **min_down** osds coming from different + // subtree level (e.g., hosts) for fast failure detection. + auto min_down = cct->_conf.get_val("mon_osd_min_down_reporters"); + auto subtree = cct->_conf.get_val("mon_osd_reporter_subtree_level"); + osdmap->get_random_up_osds_by_subtree( + whoami, subtree, min_down, want, &want); + for (set::iterator p = want.begin(); p != want.end(); ++p) { dout(10) << " adding neighbor peer osd." << *p << dendl; extras.insert(*p); diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 93550dc6e4da0..2fa444351f727 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -5185,3 +5185,40 @@ int OSDMap::parse_osd_id_list(const vector& ls, set *out, } return 0; } + +void OSDMap::get_random_up_osds_by_subtree(int n, // whoami + string &subtree, + int limit, // how many + set skip, + set *want) const { + if (limit <= 0) + return; + int subtree_type = crush->get_type_id(subtree); + if (subtree_type < 1) + return; + vector subtrees; + crush->get_subtree_of_type(subtree_type, &subtrees); + std::random_shuffle(subtrees.begin(), subtrees.end()); + for (auto s : subtrees) { + if (limit <= 0) + break; + if (crush->subtree_contains(s, n)) + continue; + vector osds; + crush->get_children_of_type(s, 0, &osds); + if (osds.empty()) + continue; + vector up_osds; + for (auto o : osds) { + if (is_up(o) && !skip.count(o)) + up_osds.push_back(o); + } + if (up_osds.empty()) + continue; + auto it = up_osds.begin(); + std::advance(it, (n % up_osds.size())); + want->insert(*it); + --limit; + } +} + diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 29715e73479fe..8e510a3b10189 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -973,6 +973,13 @@ public: return -1; } + + void get_random_up_osds_by_subtree(int n, 
// whoami + string &subtree, + int limit, // how many + set skip, + set *want) const; + /** * get feature bits required by the current structure *