} else {
int num_in_osds = 0;
int num_down_in_osds = 0;
- int num_in_subtrees = 0;
- int num_down_in_subtrees = 0;
set<int> osds;
- set<int> down_cache; // quick cache of down subtrees
- set<int> in_subtrees;
- set<int> up_in_subtrees;
- set<int> down_in_subtrees;
- set<string> down_in_subtree_names;
- int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
+ set<int> down_in_osds;
+ set<int> up_in_osds;
+ set<int> subtree_up;
+ unordered_map<int, set<int> > subtree_type_down;
+ unordered_map<int, int> num_osds_subtree;
+ int max_type = osdmap.crush->get_num_type_names() - 1;
+
for (int i = 0; i < osdmap.get_max_osd(); i++) {
if (!osdmap.exists(i)) {
if (osdmap.crush->item_exists(i)) {
if (osdmap.is_out(i))
continue;
++num_in_osds;
- // get the id of the parent subtree
- int subtree_id = osdmap.get_parent_subtree_id(g_ceph_context, i, type, &down_cache);
- if (subtree_id != -ENOENT) {
- in_subtrees.insert(subtree_id);
- }
-
+ if (down_in_osds.count(i) || up_in_osds.count(i))
+ continue;
if (!osdmap.is_up(i)) {
- ++num_down_in_osds;
- if (detail) {
- const osd_info_t& info = osdmap.get_info(i);
- ostringstream ss;
- map<string, string> loc;
- loc = osdmap.crush->get_full_location(i);
- ss << "osd." << i << loc << " is down since epoch " << info.down_at << ", last address "
- << osdmap.get_addr(i);
- detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ down_in_osds.insert(i);
+ int parent_id = 0;
+ int current = i;
+ for (int type = 0; type <= max_type; type++) {
+ int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
+ if (r == -ENOENT)
+ break;
+ // break early if this parent is already marked as up
+ if (subtree_up.count(parent_id))
+ break;
+ type = osdmap.crush->get_bucket_type(parent_id);
+ if (!osdmap.subtree_type_is_down(g_ceph_context, parent_id, type, &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
+ break;
+ current = parent_id;
}
- } else {
- // if an osd in a subtree is up, implies subtree is not down
- up_in_subtrees.insert(subtree_id);
}
}
- set_difference(in_subtrees.begin(), in_subtrees.end(),
- up_in_subtrees.begin(), up_in_subtrees.end(),
- inserter(down_in_subtrees, down_in_subtrees.end()));
- num_in_subtrees = in_subtrees.size();
- num_down_in_subtrees = down_in_subtrees.size();
- for (set<int>::iterator it = down_in_subtrees.begin();
- it != down_in_subtrees.end(); ++it) {
- down_in_subtree_names.insert(osdmap.crush->get_item_name(*it));
+ // calculate the number of down osds in each down subtree and store it in num_osds_subtree
+ for (int type = 1; type <= max_type; type++) {
+ for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) {
+ if (type == 1) {
+ list<int> children;
+ int num = osdmap.crush->get_children(*j, &children);
+ num_osds_subtree[*j] = num;
+ } else {
+ list<int> children;
+ int num = 0;
+ int num_children = osdmap.crush->get_children(*j, &children);
+ if (num_children == 0)
+ continue;
+ for (auto l = children.begin(); l != children.end(); ++l) {
+ if (num_osds_subtree[*l] > 0) {
+ num = num + num_osds_subtree[*l];
+ }
+ }
+ num_osds_subtree[*j] = num;
+ }
+ }
}
+ num_down_in_osds = down_in_osds.size();
assert(num_down_in_osds <= num_in_osds);
- assert(num_down_in_subtrees <= num_in_subtrees);
if (num_down_in_osds > 0) {
ostringstream ss;
- ss << num_down_in_osds << "/" << num_in_osds << " in osds are down";
+ ss << "\n";
+ // summary of down subtree types and osds
+ for (int type = max_type; type > 0; type--) {
+ if (subtree_type_down[type].size() > 0) {
+ ss << subtree_type_down[type].size() << " " << osdmap.crush->get_type_name(type);
+ if (subtree_type_down[type].size() > 1) {
+ ss << "s";
+ }
+ int sum_down_osds = 0;
+ for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) {
+ sum_down_osds = sum_down_osds + num_osds_subtree[*j];
+ }
+ ss << " (" << sum_down_osds << " osds) down\n";
+ }
+ }
+ ss << down_in_osds.size() << " osds are down\n";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
- if (num_down_in_subtrees > 0) {
- ostringstream sst;
- if (num_in_subtrees == 1) {
- sst << num_down_in_subtrees << "/" << num_in_subtrees << " " << g_conf->mon_osd_down_out_subtree_limit <<
- " is down";
- sst << "(" << down_in_subtree_names << ")";
- summary.push_back(make_pair(HEALTH_WARN, sst.str()));
- } else {
- sst << num_down_in_subtrees << "/" << num_in_subtrees << " " << g_conf->mon_osd_down_out_subtree_limit <<
- "s are down";
- sst << "(" << down_in_subtree_names << ")";
- summary.push_back(make_pair(HEALTH_WARN, sst.str()));
+
+ if (detail) {
+ ostringstream ss;
+ // details of down subtree types
+ for (int type = max_type; type > 0; type--) {
+ for (auto j = subtree_type_down[type].rbegin(); j != subtree_type_down[type].rend(); ++j) {
+ ss << osdmap.crush->get_type_name(type);
+ ss << " ";
+ ss << osdmap.crush->get_item_name(*j);
+ // at the top level, do not print location
+ if (type != max_type) {
+ ss << " (";
+ ss << osdmap.crush->get_full_location_ordered_string(*j);
+ ss << ")";
+ }
+ int num = num_osds_subtree[*j];
+ ss << " (" << num << " osds)";
+ ss << " is down\n";
+ }
}
+ // details of down osds
+ for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
+ ss << "osd." << *it << " (";
+ ss << osdmap.crush->get_full_location_ordered_string(*it);
+ ss << ") is down\n";
+ }
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
}
}
}
-int OSDMap::get_parent_subtree_id(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
+// Returns true when CRUSH item `id` is down.
+//
+// id >= 0 is handled as an OSD: the result is is_down(id), and an OSD that is
+// not marked out is also filed under *down_in_osds or *up_in_osds.
+// id < 0 is handled as a bucket: it is down only if every child returned by
+// crush->get_children() is down (checked recursively), so a bucket without
+// children is reported as down.
+//
+// cct: only forwarded to the recursive calls.
+// subtree_type: key under which a down bucket is stored in *subtree_type_down.
+// down_in_osds, up_in_osds: must not be null; receive the in OSDs visited,
+// split by state.
+// subtree_up: must not be null; receives each bucket found to have an up child.
+// subtree_type_down: may be null; when given it also serves as a cache of
+// buckets already known to be down.
+//
+// The walk stops at the first up child, so later siblings are neither visited
+// nor recorded by that call.
+bool OSDMap::subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
+ set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const
{
- set<int> local_down_cache;
- if (!down_cache) {
- down_cache = &local_down_cache;
- }
-
- int current = id;
- while (true) {
- int type;
- if (current >= 0) {
- type = 0;
+ if (id >= 0) {
+ bool is_down_ret = is_down(id);
+ // Only "in" OSDs belong in these sets: the recursion over bucket children
+ // can reach OSDs that are marked out, and recording them would push
+ // down_in_osds past the number of in OSDs that callers compare it against.
+ if (!is_out(id)) {
+ if (is_down_ret) {
+ down_in_osds->insert(id);
} else {
- type = crush->get_bucket_type(current);
+ up_in_osds->insert(id);
}
- assert(type >= 0);
+ }
+ return is_down_ret;
+ }
- if (type >= subtree_type) {
- return current;
- }
+ // Cached verdict: this bucket was already found to be down.
+ if (subtree_type_down &&
+ (*subtree_type_down)[subtree_type].count(id)) {
+ return true;
+ }
- int r = crush->get_immediate_parent_id(current, &current);
- if (r < 0) {
- return -ENOENT;
+ list<int> children;
+ crush->get_children(id, &children);
+ for (const auto &child : children) {
+ if (!subtree_type_is_down(cct, child, crush->get_bucket_type(child), down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
+ // A single up child is enough to make the whole bucket up.
+ subtree_up->insert(id);
+ return false;
}
}
+ if (subtree_type_down) {
+ (*subtree_type_down)[subtree_type].insert(id);
+ }
+ return true;
}
void OSDMap::Incremental::encode_client_old(bufferlist& bl) const