From 14e0eadec9d80b91c3adcea8ac62fc1b2dd7db80 Mon Sep 17 00:00:00 2001
From: Neha Ojha
Date: Mon, 1 May 2017 22:56:39 -0700
Subject: [PATCH] mon: add crush type down health warnings

Signed-off-by: Neha Ojha
---
Review notes (this area is not part of the commit message):
  * OSDMonitor::get_health: the summary HEALTH_WARN is pushed after the
    singular/plural wording has been chosen, so it is also emitted when
    exactly one subtree is down. It used to sit inside the plural-only
    else branch, which dropped the warning in the singular case.
  * OSDMap::get_parent_subtree_id walks from the given item up through
    its immediate parents and returns the first item whose bucket type is
    >= subtree_type (devices count as type 0), or -ENOENT when a parent
    lookup fails.
  * Leading whitespace of the hunk lines was reconstructed; if the patch
    does not apply cleanly, retry with `git apply --ignore-whitespace`.

 src/mon/OSDMonitor.cc | 44 ++++++++++++++++++++++++++++++++++++++++---
 src/osd/OSDMap.cc     | 28 +++++++++++++++++++++++++++
 src/osd/OSDMap.h      |  4 ++++
 3 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 212195ffd1fad..799db74e99d1c 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -3460,7 +3460,14 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
   } else {
     int num_in_osds = 0;
     int num_down_in_osds = 0;
+    int num_in_subtrees = 0;
+    int num_down_in_subtrees = 0;
     set<int> osds;
+    set<int> down_cache;  // quick cache of down subtrees
+    set<int> in_subtrees;
+    set<int> up_in_subtrees;
+    set<int> down_in_subtrees;
+    int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
     for (int i = 0; i < osdmap.get_max_osd(); i++) {
       if (!osdmap.exists(i)) {
         if (osdmap.crush->item_exists(i)) {
@@ -3471,22 +3478,53 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
       if (osdmap.is_out(i))
         continue;
       ++num_in_osds;
+      // get the id of the parent subtree
+      int subtree_id = osdmap.get_parent_subtree_id(g_ceph_context, i, type, &down_cache);
+      if (subtree_id != -ENOENT) {
+        in_subtrees.insert(subtree_id);
+      }
+
       if (!osdmap.is_up(i)) {
         ++num_down_in_osds;
         if (detail) {
           const osd_info_t& info = osdmap.get_info(i);
           ostringstream ss;
-          ss << "osd." << i << " is down since epoch " << info.down_at
+          ss << "osd." << i << " belonging to " << g_conf->mon_osd_down_out_subtree_limit
+             << " id " << subtree_id << " is down since epoch " << info.down_at
              << ", last address " << osdmap.get_addr(i);
           detail->push_back(make_pair(HEALTH_WARN, ss.str()));
         }
       }
+      else {
+        // if an osd in a subtree is up, implies subtree is not down
+        up_in_subtrees.insert(subtree_id);
+      }
     }
+
+    set_difference(in_subtrees.begin(), in_subtrees.end(),
+                   up_in_subtrees.begin(), up_in_subtrees.end(),
+                   inserter(down_in_subtrees, down_in_subtrees.end()));
+    num_in_subtrees = in_subtrees.size();
+    num_down_in_subtrees = down_in_subtrees.size();
     assert(num_down_in_osds <= num_in_osds);
+    assert(num_down_in_subtrees <= num_in_subtrees);
     if (num_down_in_osds > 0) {
       ostringstream ss;
-      ss << num_down_in_osds << "/" << num_in_osds << " in osds are down";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+      ss << num_down_in_osds << "/" << num_in_osds << " in osds are down. ";
+      if (num_down_in_subtrees == 1) {
+        ss << num_down_in_subtrees << "/" << num_in_subtrees << " of CRUSH type " <<
+          g_conf->mon_osd_down_out_subtree_limit << " is down. ";
+      }
+      else {
+        ss << num_down_in_subtrees << "/" << num_in_subtrees << " of CRUSH type " <<
+          g_conf->mon_osd_down_out_subtree_limit << " are down. ";
+      }
+      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+      if (detail) {
+        ss << "CRUSH type " << g_conf->mon_osd_down_out_subtree_limit << " down list: [" <<
+          down_in_subtrees << "]";
+        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
     }
 
     if (!osds.empty()) {
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 9fb9000a0dc3c..d5229d3faee01 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -293,6 +293,34 @@ bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_ty
   }
 }
 
+int OSDMap::get_parent_subtree_id(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
+{
+  set<int> local_down_cache;
+  if (!down_cache) {
+    down_cache = &local_down_cache;
+  }
+
+  int current = id;
+  while (true) {
+    int type;
+    if (current >= 0) {
+      type = 0;
+    } else {
+      type = crush->get_bucket_type(current);
+    }
+    assert(type >= 0);
+
+    if (type >= subtree_type) {
+      return current;
+    }
+
+    int r = crush->get_immediate_parent_id(current, &current);
+    if (r < 0) {
+      return -ENOENT;
+    }
+  }
+}
+
 void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
 {
   __u16 v = 5;
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index aa431389dc0bb..d430c4950f81c 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -495,6 +495,10 @@ public:
   bool subtree_is_down(int id, set<int> *down_cache) const;
   bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
   
+  /**
+   * get the id of the parent subtree
+   */
+  int get_parent_subtree_id(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
   int identify_osd(const entity_addr_t& addr) const;
   int identify_osd(const uuid_d& u) const;
   int identify_osd_on_all_channels(const entity_addr_t& addr) const;
-- 
2.39.5