//out << ", " << stopped.size() << " stopped";
}
-enum health_status_t MDSMap::
-get_health(std::ostream &ss) const
+/**
+ * Report the health of this MDS map.
+ *
+ * Two conditions are checked:
+ *  - a non-empty `failed` set lowers the status to HEALTH_ERR and adds one
+ *    summary string listing the failed ranks;
+ *  - every entry of `up` whose mds_info reports laggy() is collected, and a
+ *    non-empty collection lowers the status to HEALTH_WARN (unless it is
+ *    already lower) and adds one summary string.
+ *
+ * @param summary receives one string per condition found
+ * @param detail  if non-NULL, receives one string per failed rank and per
+ *                laggy daemon
+ * @return HEALTH_OK when neither condition holds, otherwise the lowest status
+ *         reached (the `ret > X` guards only ever lower the value)
+ */
+enum health_status_t MDSMap::get_health(list<string>& summary, list<string> *detail) const
{
health_status_t ret(HEALTH_OK);
- std::ostringstream oss;
if (!failed.empty()) {
- oss << " There are failed MDSes: ";
- string sep("");
- for (set<int32_t>::const_iterator f = failed.begin();
- f != failed.end(); ++f) {
- oss << sep << "rank " << *f;
- sep = ", ";
- }
- oss << ".";
+ std::ostringstream oss;
+ oss << "mds rank"
+ << ((failed.size() > 1) ? "s ":" ") // "ranks" when more than one rank failed
+ << failed
+ << ((failed.size() > 1) ? " have":" has") // verb agrees with the count
+ << " failed";
if (ret > HEALTH_ERR)
ret = HEALTH_ERR;
+ summary.push_back(oss.str());
+ if (detail) { // detail is optional; NULL means the caller wants no per-rank lines
+ // const_iterator: this method is const, and it matches the set<int32_t> loop this change replaces
+ for (set<int32_t>::const_iterator p = failed.begin(); p != failed.end(); ++p) {
+ std::ostringstream line; // separate name so the summary stream above is not shadowed
+ line << "mds." << *p << " has failed";
+ detail->push_back(line.str());
+ }
+ }
}
map<int32_t,uint64_t>::const_iterator u = up.begin();
map<int32_t,uint64_t>::const_iterator u_end = up.end();
map<uint64_t,mds_info_t>::const_iterator m_end = mds_info.end();
- string prefix(" There are lagging MDSes: ");
+ set<string> laggy; // names of entries in `up` whose info reports laggy()
for (; u != u_end; ++u) {
map<uint64_t,mds_info_t>::const_iterator m = mds_info.find(u->second);
assert(m != m_end);
const mds_info_t &mds_info(m->second);
if (mds_info.laggy()) {
- oss << prefix << mds_info.name << "(rank " << mds_info.rank << ")" ;
- prefix = ", ";
- if (ret > HEALTH_WARN)
- ret = HEALTH_WARN;
+ laggy.insert(mds_info.name); // summary is built once, after the loop
+ if (detail) {
+ std::ostringstream oss;
+ oss << "mds." << mds_info.name << " at " << mds_info.addr << " is laggy/unresponsive";
+ detail->push_back(oss.str()); // one detail line per laggy daemon
+ }
}
}
- ss << oss.str();
+ if (laggy.size()) {
+ std::ostringstream oss;
+ oss << "mds " << laggy
+ << ((laggy.size() > 1) ? " are":" is") // verb agrees with the count
+ << " laggy";
+ summary.push_back(oss.str());
+ if (ret > HEALTH_WARN) // never raise a status that is already lower than WARN
+ ret = HEALTH_WARN;
+ }
return ret;
}
return find_unused_for(mds, name);
}
- enum health_status_t get_health(std::ostream &ss) const;
+ enum health_status_t get_health(list<string>& summary, list<string> *detail) const;
// mds states
bool is_down(int m) { return up.count(m) == 0; }
update_logger();
}
-enum health_status_t MDSMonitor::get_health(ostream &oss) const
+enum health_status_t MDSMonitor::get_health(list<string>& summary, list<string> *detail) const // health of the MDS map held by this monitor
{
- return mdsmap.get_health(oss);
+ return mdsmap.get_health(summary, detail); // forward both output lists unchanged; mdsmap decides the status
}
bool MDSMonitor::preprocess_command(MMonCommand *m)
bool preprocess_offload_targets(MMDSLoadTargets *m);
bool prepare_offload_targets(MMDSLoadTargets *m);
- enum health_status_t get_health(std::ostream &ss) const;
+ enum health_status_t get_health(list<string>& summary, list<string> *detail) const;
int fail_mds(std::ostream &ss, const std::string &arg);
int cluster_fail(std::ostream &ss);
return false;
}
+ string get_name(unsigned n) const { // name stored at index n of rank_name, returned by value
+ assert(n < rank_name.size()); // n must be a valid index into rank_name
+ return rank_name[n];
+ }
+
int get_rank(const string& n) {
for (unsigned i=0; i<rank_name.size(); i++)
if (rank_name[i] == n)
}
if (m->cmd[0] == "health") {
health_status_t overall = HEALTH_OK;
- string combined;
+ list<string> summary; // one entry per problem, filled in by every service
+ list<string> detail; // per-item lines; only filled when detail was requested
for (vector<PaxosService*>::iterator p = paxos_service.begin();
p != paxos_service.end();
p++) {
PaxosService *s = *p;
- ostringstream oss;
- health_status_t ret = s->get_health(oss);
+ // any argument after "health" requests detail; otherwise pass NULL
+ health_status_t ret = s->get_health(summary, (m->cmd.size() > 1) ? &detail : NULL);
if (ret < overall)
overall = ret;
- string cur = oss.str();
- if (cur.length()) {
- if (combined.length())
- combined += "; ";
- combined += cur;
- }
}
stringstream ss;
ss << overall;
- if (combined.length())
- ss << " " << combined;
+ if (!summary.empty()) {
+ ss << ' ';
+ while (!summary.empty()) { // drain the list, joining entries with "; "
+ ss << summary.front();
+ summary.pop_front();
+ if (!summary.empty()) // no separator after the last entry
+ ss << "; ";
+ }
+ }
rs = ss.str();
+ while (!detail.empty()) { // append each detail line to rdata, newline-terminated
+ rdata.append(detail.front());
+ rdata.append('\n');
+ detail.pop_front();
+ }
r = 0;
}
if (m->cmd[0] == "heap") {
update_from_paxos();
}
-enum health_status_t MonmapMonitor::get_health(std::ostream &ss) const
+enum health_status_t MonmapMonitor::get_health(list<string>& summary, list<string> *detail) const // HEALTH_WARN when the quorum is smaller than max
{
enum health_status_t ret(HEALTH_OK);
int actual = mon->get_quorum().size();
if (actual < max) {
ret = HEALTH_WARN;
+ ostringstream ss; // local stream; the old ostream parameter is gone
ss << (max-actual) << " mons down, quorum " << mon->get_quorum();
+ summary.push_back(ss.str());
+ if (detail) { // detail is optional; NULL skips the per-monitor lines
+ set<int> q = mon->get_quorum(); // ranks currently in quorum
+ for (int i=0; i<max; i++) {
+ if (q.count(i) == 0) { // rank i is not in the quorum set
+ ostringstream ss; // scoped to this iteration; hides the summary stream above
+ ss << "mon." << mon->monmap->get_name(i) << " (rank " << i
+ << ") addr " << mon->monmap->get_addr(i)
+ << " is down (out of quorum)";
+ detail->push_back(ss.str()); // one detail line per missing rank
+ }
+ }
+ }
}
return ret;
bool preprocess_command(MMonCommand *m);
bool prepare_command(MMonCommand *m);
- enum health_status_t get_health(std::ostream &ss) const;
+ enum health_status_t get_health(list<string>& summary, list<string> *detail) const;
/*
* Since monitors are pretty
propose_pending();
}
-enum health_status_t OSDMonitor::get_health(std::ostream &ss) const
+enum health_status_t OSDMonitor::get_health(list<string>& summary, list<string> *detail) const // HEALTH_ERR with no osds, HEALTH_WARN when fewer are up than in
{
enum health_status_t ret(HEALTH_OK);
int num_in_osds = osdmap.get_num_in_osds();
if (num_osds == 0) {
- ss << "no osds";
+ summary.push_back("no osds"); // no detail lines are produced for this case
ret = HEALTH_ERR;
} else {
if (num_up_osds < num_in_osds) {
+ ostringstream ss; // local stream; the old ostream parameter is gone
ss << (num_in_osds - num_up_osds) << "/" << num_in_osds << " in osds are down";
+ summary.push_back(ss.str());
ret = HEALTH_WARN;
+
+ if (detail) { // detail is optional; NULL skips the per-osd lines
+ for (int i = 0; i < osdmap.get_max_osd(); i++) {
+ if (osdmap.exists(i) && !osdmap.is_up(i)) { // every existing osd that is not up
+ const osd_info_t& info = osdmap.get_info(i);
+ ostringstream ss; // scoped to this iteration; hides the summary stream above
+ ss << "osd." << i << " is down since epoch " << info.down_at
+ << ", last address " << osdmap.get_addr(i);
+ detail->push_back(ss.str()); // one detail line per down osd
+ }
+ }
+ }
}
}
return ret;
void tick(); // check state, take actions
- enum health_status_t get_health(std::ostream &ss) const;
+ enum health_status_t get_health(list<string>& summary, list<string> *detail) const;
bool preprocess_command(MMonCommand *m);
bool prepare_command(MMonCommand *m);
return false;
}
-enum health_status_t PGMonitor::get_health(std::ostream &ss) const
+enum health_status_t PGMonitor::get_health(list<string>& summary, list<string> *detail) const // pg states, recovery, and near-full/full osds
{
enum health_status_t ret(HEALTH_OK);
- map<string,int> note;
+ map<string,int> note; // label -> pg count; each entry becomes one summary string below
hash_map<int,int>::const_iterator p = pg_map.num_pg_by_state.begin();
hash_map<int,int>::const_iterator p_end = pg_map.num_pg_by_state.end();
for (; p != p_end; ++p) {
note["stuck stale"] = stuck_pgs.size();
}
+ if (detail) { // detail is optional; NULL skips all per-pg lines
+ for (hash_map<pg_t,pg_stat_t>::iterator p = stuck_pgs.begin(); // loop-scoped p hides the outer p
+ p != stuck_pgs.end();
+ ++p) {
+ ostringstream ss;
+ ss << "pg " << p->first << " is stuck " << pg_state_string(p->second.state)
+ << ", last acting " << p->second.acting;
+ detail->push_back(ss.str()); // one detail line per entry of stuck_pgs
+ }
+ }
+
if (!note.empty()) {
- ret = HEALTH_WARN;
+ if (ret > HEALTH_WARN) // only lower the status, never raise it
+ ret = HEALTH_WARN;
for (map<string,int>::iterator p = note.begin(); p != note.end(); p++) {
- if (p != note.begin())
- ss << ", ";
+ ostringstream ss; // one stream, and one summary entry, per note
ss << p->second << " pgs " << p->first;
+ summary.push_back(ss.str());
+ }
+ if (detail) {
+ for (hash_map<pg_t,pg_stat_t>::const_iterator p = pg_map.pg_stat.begin();
+ p != pg_map.pg_stat.end();
+ ++p) {
+ if (p->second.state & (PG_STATE_STALE | // any one of these state bits selects the pg
+ PG_STATE_DOWN |
+ PG_STATE_DEGRADED |
+ PG_STATE_INCONSISTENT |
+ PG_STATE_PEERING |
+ PG_STATE_REPAIR |
+ PG_STATE_SPLITTING |
+ PG_STATE_RECOVERING |
+ PG_STATE_INCOMPLETE |
+ PG_STATE_BACKFILL) &&
+ stuck_pgs.count(p->first) == 0) { // pgs in stuck_pgs already got a line above
+ ostringstream ss;
+ ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
+ detail->push_back(ss.str());
+ }
+ }
}
}
stringstream rss;
pg_map.recovery_summary(rss);
if (!rss.str().empty()) {
- if (ret != HEALTH_OK)
- ss << ", ";
ret = HEALTH_WARN;
- ss << rss.str();
+ summary.push_back(rss.str());
+ if (detail) // the same recovery text is repeated in the detail list
+ detail->push_back(rss.str());
}
-
+
if (pg_map.nearfull_osds.size() > 0) {
- if (ret != HEALTH_OK)
- ss << ", ";
+ ostringstream ss; // local stream; the old ostream parameter is gone
ss << pg_map.nearfull_osds.size() << " near full osd(s)";
- ret = HEALTH_WARN;
+ summary.push_back(ss.str());
+ if (ret > HEALTH_WARN) // only lower the status, never raise it
+ ret = HEALTH_WARN;
}
if (pg_map.full_osds.size() > 0) {
- if (ret != HEALTH_OK)
- ss << ", ";
+ ostringstream ss; // local stream; the old ostream parameter is gone
ss << pg_map.full_osds.size() << " full osd(s)";
- ret = HEALTH_ERR;
+ summary.push_back(ss.str());
+ if (ret > HEALTH_ERR) // only lower the status, never raise it
+ ret = HEALTH_ERR;
}
return ret;
void check_osd_map(epoch_t epoch);
- enum health_status_t get_health(std::ostream &ss) const;
+ enum health_status_t get_health(list<string>& summary, list<string> *detail) const;
private:
// no copying allowed
virtual void tick() {}
- virtual enum health_status_t get_health(std::ostream& os) const { return HEALTH_OK; }
+ /**
+ * Get health information for this service.
+ *
+ * This default implementation reports no problems: it returns HEALTH_OK
+ * and leaves both lists untouched.
+ *
+ * @param summary list that receives short summary strings
+ * @param detail optional list that receives detailed problem reports; may be NULL
+ * @return HEALTH_OK, HEALTH_WARN, or HEALTH_ERR
+ */
+ virtual enum health_status_t get_health(list<string>& summary,
+ list<string> *detail) const {
+ return HEALTH_OK;
+ }
};