git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon: subtree-based crush type down health warnings
author: Neha Ojha <nojha@redhat.com>
Fri, 12 May 2017 21:20:26 +0000 (14:20 -0700)
committer: Neha Ojha <nojha@redhat.com>
Fri, 12 May 2017 21:20:26 +0000 (14:20 -0700)
Signed-off-by: Neha Ojha <nojha@redhat.com>
src/crush/CrushWrapper.cc
src/crush/CrushWrapper.h
src/mon/OSDMonitor.cc
src/osd/OSDMap.cc
src/osd/OSDMap.h

index 7974b9a75014e698ec91afe9544f57fc4abab4d4..8dd94328f8094256b70136c2def314f66e92b94e 100644 (file)
@@ -587,6 +587,20 @@ int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string>
   return 0;
 }
 
+string CrushWrapper::get_full_location_ordered_string(int id)  // builds "first=second,first=second,..." from id's location pairs
+{
+  vector<pair<string, string> > full_location_ordered;
+  string full_location;  // stays empty when no pairs are produced
+  get_full_location_ordered(id, full_location_ordered);  // int return value is not checked here
+  reverse(begin(full_location_ordered), end(full_location_ordered));  // flip the order reported by get_full_location_ordered
+  for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) {
+    full_location = full_location + i->first + "=" + i->second;
+    if (i != full_location_ordered.end() - 1) {  // separator between pairs, none after the last one
+      full_location = full_location + ",";
+    }
+  }
+  return full_location;
+}
 
 map<int, string> CrushWrapper::get_parent_hierarchy(int id)
 {
index 54fd572f8cc41a4c4fdbbed982bc36a78a9aea18..6ab6dd55fae9f1b22661129a18381b0ed5f9f9f0 100644 (file)
@@ -590,6 +590,15 @@ public:
    */
   int get_full_location_ordered(int id, vector<pair<string, string> >& path);
 
+  /*
+   * String form of get_full_location_ordered(int id, vector<pair<string, string> >& path):
+   * the type/name pairs are joined in descending hierarchical order,
+   * formatted as key1=val1,key2=val2 (no trailing comma).
+   *
+   * returns the location as a string; empty if no pairs were found.
+   */
+  string get_full_location_ordered_string(int id);
+
   /**
    * returns (type_id, type) of all parent buckets between id and
    * default, can be used to check for anomolous CRUSH maps
index a2a9a9c5a638c0f059592d5dd4b13484e89592a1..a4be13cc20326832cb2b1f9e5454cff9ad456e63 100644 (file)
@@ -3460,15 +3460,14 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
   } else {
     int num_in_osds = 0;
     int num_down_in_osds = 0;
-    int num_in_subtrees = 0;
-    int num_down_in_subtrees = 0;
     set<int> osds;
-    set<int> down_cache;  // quick cache of down subtrees
-    set<int> in_subtrees;
-    set<int> up_in_subtrees;
-    set<int> down_in_subtrees;
-    set<string> down_in_subtree_names;
-    int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
+    set<int> down_in_osds;
+    set<int> up_in_osds;
+    set<int> subtree_up;
+    unordered_map<int, set<int> > subtree_type_down;
+    unordered_map<int, int> num_osds_subtree;
+    int max_type = osdmap.crush->get_num_type_names() - 1;
+
     for (int i = 0; i < osdmap.get_max_osd(); i++) {
       if (!osdmap.exists(i)) {
         if (osdmap.crush->item_exists(i)) {
@@ -3479,57 +3478,97 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
       if (osdmap.is_out(i))
         continue;
       ++num_in_osds;
-      // get the id of the parent subtree
-      int subtree_id = osdmap.get_parent_subtree_id(g_ceph_context, i, type, &down_cache);
-      if (subtree_id != -ENOENT) {
-        in_subtrees.insert(subtree_id);
-      }
-
+      if (down_in_osds.count(i) || up_in_osds.count(i))
+       continue;
       if (!osdmap.is_up(i)) {
-       ++num_down_in_osds;
-       if (detail) {
-         const osd_info_t& info = osdmap.get_info(i);
-         ostringstream ss;
-         map<string, string> loc;
-         loc = osdmap.crush->get_full_location(i);
-         ss << "osd." << i << loc << " is down since epoch " << info.down_at << ", last address "
-            << osdmap.get_addr(i);
-         detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+       down_in_osds.insert(i);
+       int parent_id = 0;
+       int current = i;
+       for (int type = 0; type <= max_type; type++) {
+         int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
+         if (r == -ENOENT)
+           break;
+         // break early if this parent is already marked as up
+         if (subtree_up.count(parent_id))
+           break;
+         type = osdmap.crush->get_bucket_type(parent_id);
+         if (!osdmap.subtree_type_is_down(g_ceph_context, parent_id, type, &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
+           break;
+         current = parent_id;
        }
-      } else {
-       // if an osd in a subtree is up, implies subtree is not down
-        up_in_subtrees.insert(subtree_id);
       }
     }
 
-    set_difference(in_subtrees.begin(), in_subtrees.end(),
-                  up_in_subtrees.begin(), up_in_subtrees.end(),
-                  inserter(down_in_subtrees, down_in_subtrees.end()));
-    num_in_subtrees = in_subtrees.size();
-    num_down_in_subtrees = down_in_subtrees.size();
-    for (set<int>::iterator it = down_in_subtrees.begin();
-       it != down_in_subtrees.end(); ++it) {
-      down_in_subtree_names.insert(osdmap.crush->get_item_name(*it));
+    // calculate the number of down osds in each down subtree and store it in num_osds_subtree
+    for (int type = 1; type <= max_type; type++) {
+      for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) {
+       if (type == 1) {
+          list<int> children;
+          int num = osdmap.crush->get_children(*j, &children);
+          num_osds_subtree[*j] = num;
+        } else {
+          list<int> children;
+          int num = 0;
+          int num_children = osdmap.crush->get_children(*j, &children);
+          if (num_children == 0)
+           continue;
+          for (auto l = children.begin(); l != children.end(); ++l) {
+            if (num_osds_subtree[*l] > 0) {
+              num = num + num_osds_subtree[*l];
+            }
+          }
+          num_osds_subtree[*j] = num;
+       }
+      }
     }
+    num_down_in_osds = down_in_osds.size();
     assert(num_down_in_osds <= num_in_osds);
-    assert(num_down_in_subtrees <= num_in_subtrees);
     if (num_down_in_osds > 0) {
       ostringstream ss;
-      ss << num_down_in_osds << "/" << num_in_osds << " in osds are down";
+      ss << "\n";
+      // summary of down subtree types and osds
+      for (int type = max_type; type > 0; type--) {
+       if (subtree_type_down[type].size() > 0) {
+         ss << subtree_type_down[type].size() << " " << osdmap.crush->get_type_name(type);
+         if (subtree_type_down[type].size() > 1) {
+           ss << "s";
+         }
+         int sum_down_osds = 0;
+         for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) {
+           sum_down_osds = sum_down_osds + num_osds_subtree[*j];
+         }
+          ss << " (" << sum_down_osds << " osds) down\n";
+       }
+      }
+      ss << down_in_osds.size() << " osds are down\n";
       summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-      if (num_down_in_subtrees > 0) {
-       ostringstream sst;
-        if (num_in_subtrees == 1) {
-          sst << num_down_in_subtrees << "/" << num_in_subtrees << " " << g_conf->mon_osd_down_out_subtree_limit <<
-                " is down";
-         sst << "(" << down_in_subtree_names << ")";
-         summary.push_back(make_pair(HEALTH_WARN, sst.str()));
-        } else {
-          sst << num_down_in_subtrees << "/" << num_in_subtrees << " " << g_conf->mon_osd_down_out_subtree_limit <<
-                "s are down";
-         sst << "(" << down_in_subtree_names << ")";
-         summary.push_back(make_pair(HEALTH_WARN, sst.str()));
+
+      if (detail) {
+       ostringstream ss;
+       // details of down subtree types
+       for (int type = max_type; type > 0; type--) {
+         for (auto j = subtree_type_down[type].rbegin(); j != subtree_type_down[type].rend(); ++j) {
+           ss << osdmap.crush->get_type_name(type);
+           ss << " ";
+           ss << osdmap.crush->get_item_name(*j);
+           // at the top level, do not print location
+           if (type != max_type) {
+              ss << " (";
+              ss << osdmap.crush->get_full_location_ordered_string(*j);
+              ss << ")";
+           }
+           int num = num_osds_subtree[*j];
+           ss << " (" << num << " osds)";
+           ss << " is down\n";
+         }
         }
+       // details of down osds
+       for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
+         ss << "osd." << *it << " (";
+         ss << osdmap.crush->get_full_location_ordered_string(*it);
+          ss << ") is down\n";
+       }
+        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
       }
     }
 
index d5229d3faee01797f906e5346f054974285e30a5..a82c19e054a50afb53e23bc0433fa1574851e2f8 100644 (file)
@@ -293,32 +293,36 @@ bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_ty
   }
 }
 
-int OSDMap::get_parent_subtree_id(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
+bool OSDMap::subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,  // cct is only forwarded to the recursive call
+                                           set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const  // true if osd id is down, or if no child of bucket id is up
 {
-  set<int> local_down_cache;
-  if (!down_cache) {
-    down_cache = &local_down_cache;
-  }
-
-  int current = id;
-  while (true) {
-    int type;
-    if (current >= 0) {
-      type = 0;
+  if (id >= 0) {  // non-negative id: treated as a single osd, subtree_type is not read on this path
+    bool is_down_ret = is_down(id);
+    if (is_down_ret) {
+      down_in_osds->insert(id);  // NOTE(review): no is_out() check, so a down-but-out osd is recorded too -- confirm intended
     } else {
-      type = crush->get_bucket_type(current);
+      up_in_osds->insert(id);  // NOTE(review): down_in_osds/up_in_osds are dereferenced without a null check
     }
-    assert(type >= 0);
+    return is_down_ret;
+  }
 
-    if (type >= subtree_type) {
-      return current;
-    }
+  if (subtree_type_down &&  // bucket already recorded as down under this type: skip the walk
+      (*subtree_type_down)[subtree_type].count(id)) {
+    return true;
+  }
 
-    int r = crush->get_immediate_parent_id(current, &current);
-    if (r < 0) {
-      return -ENOENT;
+  list<int> children;
+  crush->get_children(id, &children);  // return value is not checked
+  for (const auto &child : children) {
+    if (!subtree_type_is_down(cct, child, crush->get_bucket_type(child), down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {  // stops at the first child that is not down
+      subtree_up->insert(id);  // NOTE(review): subtree_up is dereferenced without the null check subtree_type_down gets
+      return false;
     }
   }
+  if (subtree_type_down) {  // reached when no child reported up (also when there are no children)
+    (*subtree_type_down)[subtree_type].insert(id);
+  }
+  return true;
 }
 
 void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
index d430c4950f81c63f7cb9204c30078ed70ecab64d..25f274a381dcea9563adaaafa1f85141bae7a33d 100644 (file)
@@ -495,10 +495,9 @@ public:
   bool subtree_is_down(int id, set<int> *down_cache) const;
   bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
   
-  /**
-   * get the id of the parent subtree
-   */
-  int get_parent_subtree_id(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
+  bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
+                            set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const;  // true if osd id is down, or if no child of bucket id is up; records what it finds in the passed sets/map
+
   int identify_osd(const entity_addr_t& addr) const;
   int identify_osd(const uuid_d& u) const;
   int identify_osd_on_all_channels(const entity_addr_t& addr) const;