]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/OSD.cc: choose heartbeat peers by failure domain
author xie xingguo <xie.xingguo@zte.com.cn>
Wed, 8 Aug 2018 09:52:29 +0000 (17:52 +0800)
committer xie xingguo <xie.xingguo@zte.com.cn>
Thu, 9 Aug 2018 00:44:58 +0000 (08:44 +0800)
By default, the monitor requires at least two valid failure votes/reports from
different hosts to mark an OSD down, which sometimes turns out to be impossible
for a replicated pool of size 2 in clusters made up of hosts
with contiguously labeled OSDs.

This patch instead does a breadth-first search based on the highest-level
failure domain cluster-wide, so that heartbeat peers cover all failure domains
whenever possible, which can hopefully help accelerate OSD failure detection
in the above case.

Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
src/crush/CrushWrapper.cc
src/crush/CrushWrapper.h
src/osd/OSD.cc
src/osd/OSDMap.cc
src/osd/OSDMap.h

index e3747859ccd5df170d9c5cd650c9030e3bf20fe3..58e44da918368386bd5e71189e0215f6fbaf3d9c 100644 (file)
@@ -1524,6 +1524,19 @@ int CrushWrapper::get_parent_of_type(int item, int type, int rule) const
   return 0; // not found
 }
 
+/**
+ * Enumerate the buckets of a given type across the whole crush map.
+ *
+ * Looks up every root via find_roots() and, for each root whose bucket can
+ * be resolved, hands @p subtrees to get_children_of_type() together with
+ * that bucket's id and the requested @p type.
+ *
+ * @param type      crush type id to look for
+ * @param subtrees  output vector passed through to get_children_of_type()
+ */
+void CrushWrapper::get_subtree_of_type(int type, vector<int> *subtrees)
+{
+  set<int> root_ids;
+  find_roots(&root_ids);
+  for (const int root_id : root_ids) {
+    crush_bucket *bucket = get_bucket(root_id);
+    // roots whose bucket lookup yields an error pointer are ignored
+    if (!IS_ERR(bucket)) {
+      get_children_of_type(bucket->id, type, subtrees);
+    }
+  }
+}
+
+
 int CrushWrapper::rename_class(const string& srcname, const string& dstname)
 {
   auto i = class_rname.find(srcname);
index 36f8012fd87a7794af07a98314662cc5d4d359d9..40a0f1be19d256a2b3c691df39b354ff12de15c7 100644 (file)
@@ -741,6 +741,10 @@ public:
                             int type,
                            vector<int> *children,
                            bool exclude_shadow = true) const;
+  /**
+   * enumerate all subtrees by type
+   *
+   * For every root found by find_roots(), collects the children of the
+   * given type through get_children_of_type(). Roots whose bucket cannot
+   * be looked up are skipped.
+   *
+   * @param type crush type id of the subtrees to enumerate
+   * @param subtrees output vector handed to get_children_of_type()
+   */
+  void get_subtree_of_type(int type, vector<int> *subtrees);
 
   /**
     * get failure-domain type of a specific crush rule
index 86e06dd561995b1b6adca5fe686b617f5b7d02a8..d699f1f63045fa54d8cd609b63ef7fd1e1beee57 100644 (file)
@@ -4353,6 +4353,13 @@ void OSD::maybe_update_heartbeat_peers()
   if (prev >= 0 && prev != next)
     want.insert(prev);
 
+  // make sure we have at least **min_down** osds coming from different
+  // subtree level (e.g., hosts) for fast failure detection.
+  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
+  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
+  osdmap->get_random_up_osds_by_subtree(
+    whoami, subtree, min_down, want, &want);
+
   for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
     dout(10) << " adding neighbor peer osd." << *p << dendl;
     extras.insert(*p);
index 93550dc6e4da07f01f1a96c3c59e336f7ee78ed5..2fa444351f7278f9622bd645a8baa0abf5b0eb01 100644 (file)
@@ -5185,3 +5185,40 @@ int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
   }
   return 0;
 }
+
+void OSDMap::get_random_up_osds_by_subtree(int n,     // whoami
+                                           string &subtree,
+                                           int limit, // how many
+                                           set<int> skip,
+                                           set<int> *want) const {
+  if (limit <= 0)
+    return;
+  int subtree_type = crush->get_type_id(subtree);
+  if (subtree_type < 1)
+    return;
+  vector<int> subtrees;
+  crush->get_subtree_of_type(subtree_type, &subtrees);
+  std::random_shuffle(subtrees.begin(), subtrees.end());
+  for (auto s : subtrees) {
+    if (limit <= 0)
+      break;
+    if (crush->subtree_contains(s, n))
+      continue;
+    vector<int> osds;
+    crush->get_children_of_type(s, 0, &osds);
+    if (osds.empty())
+      continue;
+    vector<int> up_osds;
+    for (auto o : osds) {
+      if (is_up(o) && !skip.count(o))
+        up_osds.push_back(o);
+    }
+    if (up_osds.empty())
+      continue;
+    auto it = up_osds.begin();
+    std::advance(it, (n % up_osds.size()));
+    want->insert(*it);
+    --limit;
+  }
+}
+
index 29715e73479fefdeb5c68876c981c36badd628d8..8e510a3b10189144a9a2482c201fa7c7f548cccf 100644 (file)
@@ -973,6 +973,13 @@ public:
     return -1;
   }
 
+  /**
+   * Select up osds from subtrees of the crush type named by @p subtree
+   * that do not contain osd @p n, and insert them into @p want (at most
+   * one osd per subtree).
+   *
+   * @param n id of the calling osd; subtrees containing it are skipped
+   * @param subtree name of the crush type whose buckets are enumerated
+   * @param limit maximum number of osds to insert; nothing is done if <= 0
+   * @param skip osds that must not be selected
+   * @param want output set the selected osds are inserted into
+   */
+  void get_random_up_osds_by_subtree(int n,     // whoami
+                                     string &subtree,
+                                     int limit, // how many
+                                     set<int> skip,
+                                     set<int> *want) const;
+
   /**
    * get feature bits required by the current structure
    *