git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon: add crush type down health warnings
author Neha Ojha <nojha@redhat.com>
Tue, 2 May 2017 05:56:39 +0000 (22:56 -0700)
committer Neha Ojha <nojha@redhat.com>
Tue, 2 May 2017 17:14:49 +0000 (10:14 -0700)
Signed-off-by: Neha Ojha <nojha@redhat.com>
src/mon/OSDMonitor.cc
src/osd/OSDMap.cc
src/osd/OSDMap.h

index 212195ffd1fadb2f7e43fab8bfd4428e2154d492..799db74e99d1c9bd0d1a0af8c39114d3aa6d70ad 100644 (file)
@@ -3460,7 +3460,14 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
   } else {
     int num_in_osds = 0;
     int num_down_in_osds = 0;
+    int num_in_subtrees = 0;
+    int num_down_in_subtrees = 0;
     set<int> osds;
+    set<int> down_cache;  // quick cache of down subtrees
+    set<int> in_subtrees;
+    set<int> up_in_subtrees;
+    set<int> down_in_subtrees;
+    int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
     for (int i = 0; i < osdmap.get_max_osd(); i++) {
       if (!osdmap.exists(i)) {
         if (osdmap.crush->item_exists(i)) {
@@ -3471,22 +3478,53 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
       if (osdmap.is_out(i))
         continue;
       ++num_in_osds;
+      // get the id of the parent subtree
+      int subtree_id = osdmap.get_parent_subtree_id(g_ceph_context, i, type, &down_cache);
+      if (subtree_id != -ENOENT) {
+        in_subtrees.insert(subtree_id);
+      }
+
       if (!osdmap.is_up(i)) {
        ++num_down_in_osds;
        if (detail) {
          const osd_info_t& info = osdmap.get_info(i);
          ostringstream ss;
-         ss << "osd." << i << " is down since epoch " << info.down_at
+         ss << "osd." << i << " belonging to " << g_conf->mon_osd_down_out_subtree_limit
+            << " id " << subtree_id << " is down since epoch " << info.down_at
             << ", last address " << osdmap.get_addr(i);
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
       }
+      else {
+       // if an osd in a subtree is up, implies subtree is not down
+        up_in_subtrees.insert(subtree_id);
+      }
     }
+
+    set_difference(in_subtrees.begin(), in_subtrees.end(),
+                  up_in_subtrees.begin(), up_in_subtrees.end(),
+                  inserter(down_in_subtrees, down_in_subtrees.end()));
+    num_in_subtrees = in_subtrees.size();
+    num_down_in_subtrees = down_in_subtrees.size();
     assert(num_down_in_osds <= num_in_osds);
+    assert(num_down_in_subtrees <= num_in_subtrees);
     if (num_down_in_osds > 0) {
       ostringstream ss;
-      ss << num_down_in_osds << "/" << num_in_osds << " in osds are down";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+      ss << num_down_in_osds << "/" << num_in_osds << " in osds are down. ";
+      if (num_down_in_subtrees == 1) {
+        ss << num_down_in_subtrees << "/" << num_in_subtrees << " of CRUSH type " <<
+             g_conf->mon_osd_down_out_subtree_limit << " is down. ";
+      }
+      else {
+        ss << num_down_in_subtrees << "/" << num_in_subtrees << " of CRUSH type " <<
+              g_conf->mon_osd_down_out_subtree_limit << " are down. ";
+        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
+      if (detail) {
+       ss << "CRUSH type " << g_conf->mon_osd_down_out_subtree_limit << " down list: [" <<
+              down_in_subtrees << "]";
+        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
     }
 
     if (!osds.empty()) {
index 9fb9000a0dc3cf2bdd3df006c304e4a685db35ac..d5229d3faee01797f906e5346f054974285e30a5 100644 (file)
@@ -293,6 +293,34 @@ bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_ty
   }
 }
 
+// Walk up the crush hierarchy starting at id and return the first item (id
+// itself or one of its ancestors) whose type is >= subtree_type.  Returns
+// -ENOENT when get_immediate_parent_id() fails before such an item is found.
+// cct and down_cache are not consulted here; they stay because callers pass them.
+int OSDMap::get_parent_subtree_id(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
+{
+  int current = id;
+  while (true) {
+    // ids >= 0 are taken as type 0; negative ids are looked up as buckets
+    int type;
+    if (current >= 0) {
+      type = 0;
+    } else {
+      type = crush->get_bucket_type(current);
+    }
+    assert(type >= 0);
+
+    if (type >= subtree_type) {
+      return current;
+    }
+
+    int r = crush->get_immediate_parent_id(current, &current);
+    if (r < 0) {
+      return -ENOENT;
+    }
+  }
+}
+
 void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
 {
   __u16 v = 5;
index aa431389dc0bbd9026982412f594e87e1d8ed94d..d430c4950f81c63f7cb9204c30078ed70ecab64d 100644 (file)
@@ -495,6 +495,10 @@ public:
   bool subtree_is_down(int id, set<int> *down_cache) const;
   bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
   
+  /**
+   * get the id of the closest item at or above osd whose crush type is >= subtree_type, or -ENOENT
+   */
+  int get_parent_subtree_id(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
   int identify_osd(const entity_addr_t& addr) const;
   int identify_osd(const uuid_d& u) const;
   int identify_osd_on_all_channels(const entity_addr_t& addr) const;