From 72682e57ce5691979a20efe93d6e73307bd4f93e Mon Sep 17 00:00:00 2001 From: Neha Ojha Date: Fri, 12 May 2017 14:20:26 -0700 Subject: [PATCH] mon: subtree-based crush type down health warnings Signed-off-by: Neha Ojha --- src/crush/CrushWrapper.cc | 14 ++++ src/crush/CrushWrapper.h | 9 +++ src/mon/OSDMonitor.cc | 135 ++++++++++++++++++++++++-------------- src/osd/OSDMap.cc | 42 ++++++------ src/osd/OSDMap.h | 7 +- 5 files changed, 136 insertions(+), 71 deletions(-) diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index 7974b9a75014e..8dd94328f8094 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -587,6 +587,20 @@ int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string> return 0; } +string CrushWrapper::get_full_location_ordered_string(int id) +{ + vector<pair<string, string> > full_location_ordered; + string full_location; + get_full_location_ordered(id, full_location_ordered); + reverse(begin(full_location_ordered), end(full_location_ordered)); + for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) { + full_location = full_location + i->first + "=" + i->second; + if (i != full_location_ordered.end() - 1) { + full_location = full_location + ","; + } + } + return full_location; +} map<int, string> CrushWrapper::get_parent_hierarchy(int id) { diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index 54fd572f8cc41..6ab6dd55fae9f 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -590,6 +590,15 @@ public: */ int get_full_location_ordered(int id, vector<pair<string, string> >& path); + /* + * identical to get_full_location_ordered(int id, vector<pair<string, string> >& path), + * although it returns a concatenated string with the type/name pairs in descending + * hierarchical order with format key1=val1,key2=val2. + * + * returns the location in descending hierarchy as a string. 
+ */ + string get_full_location_ordered_string(int id); + /** * returns (type_id, type) of all parent buckets between id and * default, can be used to check for anomolous CRUSH maps diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index a2a9a9c5a638c..a4be13cc20326 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3460,15 +3460,14 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary, } else { int num_in_osds = 0; int num_down_in_osds = 0; - int num_in_subtrees = 0; - int num_down_in_subtrees = 0; set<int> osds; - set<int> down_cache; // quick cache of down subtrees - set<int> in_subtrees; - set<int> up_in_subtrees; - set<int> down_in_subtrees; - set<string> down_in_subtree_names; - int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit); + set<int> down_in_osds; + set<int> up_in_osds; + set<int> subtree_up; + unordered_map<int, set<int> > subtree_type_down; + unordered_map<int, int> num_osds_subtree; + int max_type = osdmap.crush->get_num_type_names() - 1; + for (int i = 0; i < osdmap.get_max_osd(); i++) { if (!osdmap.exists(i)) { if (osdmap.crush->item_exists(i)) { @@ -3479,57 +3478,97 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary, if (osdmap.is_out(i)) continue; ++num_in_osds; - // get the id of the parent subtree - int subtree_id = osdmap.get_parent_subtree_id(g_ceph_context, i, type, &down_cache); - if (subtree_id != -ENOENT) { - in_subtrees.insert(subtree_id); - } - + if (down_in_osds.count(i) || up_in_osds.count(i)) + continue; if (!osdmap.is_up(i)) { - ++num_down_in_osds; - if (detail) { - const osd_info_t& info = osdmap.get_info(i); - ostringstream ss; - map<string, string> loc; - loc = osdmap.crush->get_full_location(i); - ss << "osd." 
<< i << loc << " is down since epoch " << info.down_at << ", last address " - << osdmap.get_addr(i); - detail->push_back(make_pair(HEALTH_WARN, ss.str())); + down_in_osds.insert(i); + int parent_id = 0; + int current = i; + for (int type = 0; type <= max_type; type++) { + int r = osdmap.crush->get_immediate_parent_id(current, &parent_id); + if (r == -ENOENT) + break; + // break early if this parent is already marked as up + if (subtree_up.count(parent_id)) + break; + type = osdmap.crush->get_bucket_type(parent_id); + if (!osdmap.subtree_type_is_down(g_ceph_context, parent_id, type, &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down)) + break; + current = parent_id; } - } else { - // if an osd in a subtree is up, implies subtree is not down - up_in_subtrees.insert(subtree_id); } } - set_difference(in_subtrees.begin(), in_subtrees.end(), - up_in_subtrees.begin(), up_in_subtrees.end(), - inserter(down_in_subtrees, down_in_subtrees.end())); - num_in_subtrees = in_subtrees.size(); - num_down_in_subtrees = down_in_subtrees.size(); - for (set<int>::iterator it = down_in_subtrees.begin(); - it != down_in_subtrees.end(); ++it) { - down_in_subtree_names.insert(osdmap.crush->get_item_name(*it)); + // calculate the number of down osds in each down subtree and store it in num_osds_subtree + for (int type = 1; type <= max_type; type++) { + for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) { + if (type == 1) { + list<int> children; + int num = osdmap.crush->get_children(*j, &children); + num_osds_subtree[*j] = num; + } else { + list<int> children; + int num = 0; + int num_children = osdmap.crush->get_children(*j, &children); + if (num_children == 0) + continue; + for (auto l = children.begin(); l != children.end(); ++l) { + if (num_osds_subtree[*l] > 0) { + num = num + num_osds_subtree[*l]; + } + } + num_osds_subtree[*j] = num; + } + } } + num_down_in_osds = down_in_osds.size(); assert(num_down_in_osds <= num_in_osds); - assert(num_down_in_subtrees 
<= num_in_subtrees); if (num_down_in_osds > 0) { ostringstream ss; - ss << num_down_in_osds << "/" << num_in_osds << " in osds are down"; + ss << "\n"; + // summary of down subtree types and osds + for (int type = max_type; type > 0; type--) { + if (subtree_type_down[type].size() > 0) { + ss << subtree_type_down[type].size() << " " << osdmap.crush->get_type_name(type); + if (subtree_type_down[type].size() > 1) { + ss << "s"; + } + int sum_down_osds = 0; + for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) { + sum_down_osds = sum_down_osds + num_osds_subtree[*j]; + } + ss << " (" << sum_down_osds << " osds) down\n"; + } + } + ss << down_in_osds.size() << " osds are down\n"; summary.push_back(make_pair(HEALTH_WARN, ss.str())); - if (num_down_in_subtrees > 0) { - ostringstream sst; - if (num_in_subtrees == 1) { - sst << num_down_in_subtrees << "/" << num_in_subtrees << " " << g_conf->mon_osd_down_out_subtree_limit << - " is down"; - sst << "(" << down_in_subtree_names << ")"; - summary.push_back(make_pair(HEALTH_WARN, sst.str())); - } else { - sst << num_down_in_subtrees << "/" << num_in_subtrees << " " << g_conf->mon_osd_down_out_subtree_limit << - "s are down"; - sst << "(" << down_in_subtree_names << ")"; - summary.push_back(make_pair(HEALTH_WARN, sst.str())); + + if (detail) { + ostringstream ss; + // details of down subtree types + for (int type = max_type; type > 0; type--) { + for (auto j = subtree_type_down[type].rbegin(); j != subtree_type_down[type].rend(); ++j) { + ss << osdmap.crush->get_type_name(type); + ss << " "; + ss << osdmap.crush->get_item_name(*j); + // at the top level, do not print location + if (type != max_type) { + ss << " ("; + ss << osdmap.crush->get_full_location_ordered_string(*j); + ss << ")"; + } + int num = num_osds_subtree[*j]; + ss << " (" << num << " osds)"; + ss << " is down\n"; + } } + // details of down osds + for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) { + ss << 
"osd." << *it << " ("; + ss << osdmap.crush->get_full_location_ordered_string(*it); + ss << ") is down\n"; + } + detail->push_back(make_pair(HEALTH_WARN, ss.str())); } } diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index d5229d3faee01..a82c19e054a50 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -293,32 +293,36 @@ bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_ty } } -int OSDMap::get_parent_subtree_id(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const +bool OSDMap::subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds, + set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const { - set<int> local_down_cache; - if (!down_cache) { - down_cache = &local_down_cache; - } - - int current = id; - while (true) { - int type; - if (current >= 0) { - type = 0; + if (id >= 0) { + bool is_down_ret = is_down(id); + if (is_down_ret) { + down_in_osds->insert(id); } else { - type = crush->get_bucket_type(current); + up_in_osds->insert(id); } - assert(type >= 0); + return is_down_ret; + } - if (type >= subtree_type) { - return current; - } + if (subtree_type_down && + (*subtree_type_down)[subtree_type].count(id)) { + return true; + } - int r = crush->get_immediate_parent_id(current, &current); - if (r < 0) { - return -ENOENT; + list<int> children; + crush->get_children(id, &children); + for (const auto &child : children) { + if (!subtree_type_is_down(cct, child, crush->get_bucket_type(child), down_in_osds, up_in_osds, subtree_up, subtree_type_down)) { + subtree_up->insert(id); + return false; } } + if (subtree_type_down) { + (*subtree_type_down)[subtree_type].insert(id); + } + return true; } void OSDMap::Incremental::encode_client_old(bufferlist& bl) const diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index d430c4950f81c..25f274a381dce 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -495,10 +495,9 @@ public: bool subtree_is_down(int id, set<int> *down_cache) const; bool 
containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const; - /** - * get the id of the parent subtree - */ - int get_parent_subtree_id(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const; + bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds, + set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const; + int identify_osd(const entity_addr_t& addr) const; int identify_osd(const uuid_d& u) const; int identify_osd_on_all_channels(const entity_addr_t& addr) const; -- 2.39.5