git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon: refactor health, include optional detail
author Sage Weil <sage@newdream.net>
Wed, 7 Mar 2012 01:05:22 +0000 (17:05 -0800)
committer Sage Weil <sage@newdream.net>
Wed, 7 Mar 2012 01:05:22 +0000 (17:05 -0800)
Use 'ceph health' to get the usual summary, or 'ceph health detail' to
additionally get a comprehensive list of problems found.

Eventually we can format this as yaml, json, whatever, too.

Signed-off-by: Sage Weil <sage@newdream.net>
13 files changed:
src/mds/MDSMap.cc
src/mds/MDSMap.h
src/mon/MDSMonitor.cc
src/mon/MDSMonitor.h
src/mon/MonMap.h
src/mon/Monitor.cc
src/mon/MonmapMonitor.cc
src/mon/MonmapMonitor.h
src/mon/OSDMonitor.cc
src/mon/OSDMonitor.h
src/mon/PGMonitor.cc
src/mon/PGMonitor.h
src/mon/PaxosService.h

index df44a291b006e69fb8ac0904d24ee6860f188bf6..894a1521195ee65a5f3767430ba53ab27a196d89 100644 (file)
@@ -206,40 +206,54 @@ void MDSMap::print_summary(ostream& out)
   //out << ", " << stopped.size() << " stopped";
 }
 
-enum health_status_t MDSMap::
-get_health(std::ostream &ss) const
+enum health_status_t MDSMap::get_health(list<string>& summary, list<string> *detail) const
 {
   health_status_t ret(HEALTH_OK);
-  std::ostringstream oss;
 
   if (!failed.empty()) {
-    oss << " There are failed MDSes: ";
-    string sep("");
-    for (set<int32_t>::const_iterator f = failed.begin();
-        f != failed.end(); ++f) {
-      oss << sep << "rank " << *f;
-      sep = ", ";
-    }
-    oss << ".";
+    std::ostringstream oss;
+    oss << "mds rank"
+       << ((failed.size() > 1) ? "s ":" ")
+       << failed
+       << ((failed.size() > 1) ? " have":" has")
+       << " failed";
     if (ret > HEALTH_ERR)
       ret = HEALTH_ERR;
+    summary.push_back(oss.str());
+    if (detail) {
+      for (set<int>::iterator p = failed.begin(); p != failed.end(); ++p) {
+       std::ostringstream oss;
+       oss << "mds." << *p << " has failed";
+       detail->push_back(oss.str());
+      }
+    }
   }
 
   map<int32_t,uint64_t>::const_iterator u = up.begin();
   map<int32_t,uint64_t>::const_iterator u_end = up.end();
   map<uint64_t,mds_info_t>::const_iterator m_end = mds_info.end();
-  string prefix(" There are lagging MDSes: ");
+  set<string> laggy;
   for (; u != u_end; ++u) {
     map<uint64_t,mds_info_t>::const_iterator m = mds_info.find(u->second);
     assert(m != m_end);
     const mds_info_t &mds_info(m->second);
     if (mds_info.laggy()) {
-      oss << prefix << mds_info.name << "(rank " << mds_info.rank << ")" ;
-      prefix = ", ";
-      if (ret > HEALTH_WARN)
-       ret = HEALTH_WARN;
+      laggy.insert(mds_info.name);
+      if (detail) {
+       std::ostringstream oss;
+       oss << "mds." << mds_info.name << " at " << mds_info.addr << " is laggy/unresponsive";
+       detail->push_back(oss.str());
+      }
     }
   }
-  ss << oss.str();
+  if (laggy.size()) {
+    std::ostringstream oss;
+    oss << "mds " << laggy
+       << ((laggy.size() > 1) ? " are":" is")
+       << " laggy";
+    summary.push_back(oss.str());
+    if (ret > HEALTH_WARN)
+      ret = HEALTH_WARN;
+  }
   return ret;
 }
index fdb7f2cd66b835fbbc8658fd1675e47ba69971d7..eb451e49b10bfc405ee69bd2f47b29f402498365 100644 (file)
@@ -385,7 +385,7 @@ public:
       return find_unused_for(mds, name);
   }
 
-  enum health_status_t get_health(std::ostream &ss) const;
+  enum health_status_t get_health(list<string>& summary, list<string> *detail) const;
 
   // mds states
   bool is_down(int m) { return up.count(m) == 0; }
index 6e91e608582ae26822b12c4b22523597973d2517..c330a8f9174c8352a78da4518aca53e51ce26e25 100644 (file)
@@ -491,9 +491,9 @@ void MDSMonitor::on_active()
   update_logger();
 }
 
-enum health_status_t MDSMonitor::get_health(ostream &oss) const
+enum health_status_t MDSMonitor::get_health(list<string>& summary, list<string> *detail) const
 {
-  return mdsmap.get_health(oss);
+  return mdsmap.get_health(summary, detail);
 }
 
 bool MDSMonitor::preprocess_command(MMonCommand *m)
index cf869aef5eb089501757f285236f0f3f9c2ebb9f..7e0f023608afa8d592744fe06356c7d5c2cca7f9 100644 (file)
@@ -88,7 +88,7 @@ class MDSMonitor : public PaxosService {
   bool preprocess_offload_targets(MMDSLoadTargets *m);
   bool prepare_offload_targets(MMDSLoadTargets *m);
 
-  enum health_status_t get_health(std::ostream &ss) const;
+  enum health_status_t get_health(list<string>& summary, list<string> *detail) const;
   int fail_mds(std::ostream &ss, const std::string &arg);
   int cluster_fail(std::ostream &ss);
 
index 44f91ad232745ea8161bd3987bd7a80234fe4621..10c3f137dd9385aa5d5cf98232b05cb0242ceb28 100644 (file)
@@ -116,6 +116,11 @@ class MonMap {
     return false;
   }
 
+  string get_name(unsigned n) const {
+    assert(n < rank_name.size());
+    return rank_name[n];
+  }
+
   int get_rank(const string& n) {
     for (unsigned i=0; i<rank_name.size(); i++)
       if (rank_name[i] == n)
index bfeeab9e4b378e472ee0fd53b50157e32ebb7e17..324d045ca741523c5b08805660ad8c2f956898d4 100644 (file)
@@ -1028,28 +1028,35 @@ void Monitor::handle_command(MMonCommand *m)
     }
     if (m->cmd[0] == "health") {
       health_status_t overall = HEALTH_OK;
-      string combined;
+      list<string> summary;
+      list<string> detail;
       for (vector<PaxosService*>::iterator p = paxos_service.begin();
           p != paxos_service.end();
           p++) {
        PaxosService *s = *p;
        ostringstream oss;
-       health_status_t ret = s->get_health(oss);
+       health_status_t ret = s->get_health(summary, (m->cmd.size() > 1) ? &detail : NULL);
        if (ret < overall)
          overall = ret;
-       string cur = oss.str();
-       if (cur.length()) {
-         if (combined.length())
-           combined += "; ";
-         combined += cur;
-       }
       }
       
       stringstream ss;
       ss << overall;
-      if (combined.length())
-       ss << " " << combined;
+      if (!summary.empty()) {
+       ss << ' ';
+       while (!summary.empty()) {
+         ss << summary.front();
+         summary.pop_front();
+         if (!summary.empty())
+           ss << "; ";
+       }
+      }
       rs = ss.str();
+      while (!detail.empty()) {
+       rdata.append(detail.front());
+       rdata.append('\n');
+       detail.pop_front();
+      }
       r = 0;
     }
     if (m->cmd[0] == "heap") {
index 1ef58906c8295a162ebb66fce2184e0704125d7a..3c79178d7a84961100822e58fc30a730a940291a 100644 (file)
@@ -357,7 +357,7 @@ void MonmapMonitor::tick()
   update_from_paxos();
 }
 
-enum health_status_t MonmapMonitor::get_health(std::ostream &ss) const
+enum health_status_t MonmapMonitor::get_health(list<string>& summary, list<string> *detail) const
 {
   enum health_status_t ret(HEALTH_OK);
   
@@ -365,7 +365,21 @@ enum health_status_t MonmapMonitor::get_health(std::ostream &ss) const
   int actual = mon->get_quorum().size();
   if (actual < max) {
     ret = HEALTH_WARN;
+    ostringstream ss;
     ss << (max-actual) << " mons down, quorum " << mon->get_quorum();
+    summary.push_back(ss.str());
+    if (detail) {
+      set<int> q = mon->get_quorum();
+      for (int i=0; i<max; i++) {
+       if (q.count(i) == 0) {
+         ostringstream ss;
+         ss << "mon." << mon->monmap->get_name(i) << " (rank " << i
+            << ") addr " << mon->monmap->get_addr(i)
+            << " is down (out of quorum)";
+         detail->push_back(ss.str());
+       }
+      }
+    }
   }
 
   return ret;
index 2f0d01fa5343d787f5e0614a0c2b2d103e925968..35bec26c5b256dfc47de6791ee9e124d29488563 100644 (file)
@@ -61,7 +61,7 @@ class MonmapMonitor : public PaxosService {
   bool preprocess_command(MMonCommand *m);
   bool prepare_command(MMonCommand *m);
 
-  enum health_status_t get_health(std::ostream &ss) const;
+  enum health_status_t get_health(list<string>& summary, list<string> *detail) const;
 
   /*
    * Since monitors are pretty
index cc2eddc9c59921ec3250e4f176f832fcba7e11cd..3e3924f3e00538b192510ec39aba0d1fb33b33a3 100644 (file)
@@ -1234,7 +1234,7 @@ void OSDMonitor::mark_all_down()
   propose_pending();
 }
 
-enum health_status_t OSDMonitor::get_health(std::ostream &ss) const
+enum health_status_t OSDMonitor::get_health(list<string>& summary, list<string> *detail) const
 {
   enum health_status_t ret(HEALTH_OK);
 
@@ -1243,12 +1243,26 @@ enum health_status_t OSDMonitor::get_health(std::ostream &ss) const
   int num_in_osds = osdmap.get_num_in_osds();
 
   if (num_osds == 0) {
-    ss << "no osds";
+    summary.push_back("no osds");
     ret = HEALTH_ERR;
   } else {
     if (num_up_osds < num_in_osds) {
+      ostringstream ss;
       ss << (num_in_osds - num_up_osds) << "/" << num_in_osds << " in osds are down";
+      summary.push_back(ss.str());
       ret = HEALTH_WARN;
+
+      if (detail) {
+       for (int i = 0; i < osdmap.get_max_osd(); i++) {
+         if (osdmap.exists(i) && !osdmap.is_up(i)) {
+           const osd_info_t& info = osdmap.get_info(i);
+           ostringstream ss;
+           ss << "osd." << i << " is down since epoch " << info.down_at
+              << ", last address " << osdmap.get_addr(i);
+           detail->push_back(ss.str());
+         }
+       }
+      }
     }
   }
   return ret;
index f3b42b0b33ae177977d5e53f26624c18c117da89..e6b36159347d5e967c731ae692757abc515c0a76 100644 (file)
@@ -163,7 +163,7 @@ private:
 
   void tick();  // check state, take actions
 
-  enum health_status_t get_health(std::ostream &ss) const;
+  enum health_status_t get_health(list<string>& summary, list<string> *detail) const;
   bool preprocess_command(MMonCommand *m);
   bool prepare_command(MMonCommand *m);
 
index 69cf6d1e4efef80e3f806da256c44b6d46350665..dad062cd20a864ca1327b01206570db9f4b2e6b1 100644 (file)
@@ -1129,11 +1129,11 @@ bool PGMonitor::prepare_command(MMonCommand *m)
   return false;
 }
 
-enum health_status_t PGMonitor::get_health(std::ostream &ss) const
+enum health_status_t PGMonitor::get_health(list<string>& summary, list<string> *detail) const
 {
   enum health_status_t ret(HEALTH_OK);
-  map<string,int> note;
 
+  map<string,int> note;
   hash_map<int,int>::const_iterator p = pg_map.num_pg_by_state.begin();
   hash_map<int,int>::const_iterator p_end = pg_map.num_pg_by_state.end();
   for (; p != p_end; ++p) {
@@ -1180,35 +1180,70 @@ enum health_status_t PGMonitor::get_health(std::ostream &ss) const
     note["stuck stale"] = stuck_pgs.size();
   }
 
+  if (detail) {
+    for (hash_map<pg_t,pg_stat_t>::iterator p = stuck_pgs.begin();
+        p != stuck_pgs.end();
+        ++p) {
+      ostringstream ss;
+      ss << "pg " << p->first << " is stuck " << pg_state_string(p->second.state)
+        << ", last acting " << p->second.acting;
+      detail->push_back(ss.str());
+    }
+  }
+
   if (!note.empty()) {
-    ret = HEALTH_WARN;
+    if (ret > HEALTH_WARN)
+      ret = HEALTH_WARN;
     for (map<string,int>::iterator p = note.begin(); p != note.end(); p++) {
-      if (p != note.begin())
-       ss << ", ";
+      ostringstream ss;
       ss << p->second << " pgs " << p->first;
+      summary.push_back(ss.str());
+    }
+    if (detail) {
+      for (hash_map<pg_t,pg_stat_t>::const_iterator p = pg_map.pg_stat.begin();
+          p != pg_map.pg_stat.end();
+          ++p) {
+       if (p->second.state & (PG_STATE_STALE |
+                              PG_STATE_DOWN |
+                              PG_STATE_DEGRADED |
+                              PG_STATE_INCONSISTENT |
+                              PG_STATE_PEERING |
+                              PG_STATE_REPAIR |
+                              PG_STATE_SPLITTING |
+                              PG_STATE_RECOVERING |
+                              PG_STATE_INCOMPLETE |
+                              PG_STATE_BACKFILL) &&
+           stuck_pgs.count(p->first) == 0) {
+         ostringstream ss;
+         ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
+         detail->push_back(ss.str());
+       }
+      }
     }
   }
 
   stringstream rss;
   pg_map.recovery_summary(rss);
   if (!rss.str().empty()) {
-    if (ret != HEALTH_OK)
-      ss << ", ";
     ret = HEALTH_WARN;
-    ss << rss.str();
+    summary.push_back(rss.str());
+    if (detail)
+      detail->push_back(rss.str());    
   }
-
+  
   if (pg_map.nearfull_osds.size() > 0) {
-    if (ret != HEALTH_OK)
-      ss << ", ";
+    ostringstream ss;
     ss << pg_map.nearfull_osds.size() << " near full osd(s)";
-    ret = HEALTH_WARN;
+    summary.push_back(ss.str());
+    if (ret > HEALTH_WARN)
+      ret = HEALTH_WARN;
   }
   if (pg_map.full_osds.size() > 0) {
-    if (ret != HEALTH_OK)
-      ss << ", ";
+    ostringstream ss;
     ss << pg_map.full_osds.size() << " full osd(s)";
-    ret = HEALTH_ERR;
+    summary.push_back(ss.str());
+    if (ret > HEALTH_ERR)
+      ret = HEALTH_ERR;
   }
 
   return ret;
index 3dbbfc64c12f7199956adf0efa1e181af834170a..ea959316910702ebab79c76ac5bc933f026df941 100644 (file)
@@ -136,7 +136,7 @@ public:
 
   void check_osd_map(epoch_t epoch);
 
-  enum health_status_t get_health(std::ostream &ss) const;
+  enum health_status_t get_health(list<string>& summary, list<string> *detail) const;
 
 private:
   // no copying allowed
index 43e92823b7b036330cbcfadbd1c13235662e02c7..415be4874dbe2fe721f82576fed9e3d7526260d4 100644 (file)
@@ -158,7 +158,17 @@ public:
 
   virtual void tick() {}
 
-  virtual enum health_status_t get_health(std::ostream& os) const { return HEALTH_OK; }
+  /**
+   * get health information
+   *
+   * @param summary list of summary strings
+   * @param detail optional list of detailed problem reports; may be NULL
+   * @return HEALTH_OK, HEALTH_WARN, or HEALTH_ERR
+   */
+  virtual enum health_status_t get_health(list<string>& summary,
+                                         list<string> *detail) const {
+    return HEALTH_OK;
+  }
 
 };