From: Sage Weil Date: Mon, 16 Jun 2014 23:27:05 +0000 (-0700) Subject: mon/OSDMonitor: make down osd count sensible X-Git-Tag: v0.83~52^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=55a97787088b79356c678a909b2410b3924e7f5b;p=ceph.git mon/OSDMonitor: make down osd count sensible We currently log something like 1/10 in osds are down in the health warning when there are down OSDs, but this is based on a comparison of the number of up vs the number of in osds, and makes no sense when there are up osds that are not in. Instead, count only the number OSDs that are both down and in (relative to the total number of OSDs in) and warn about that. This means that, if a disk fails, and we mark it out, and the cluster fully repairs itself, it will go back to a HEALTH_OK state. I think that is a good thing, and certainly preferable to the current nonsense. If we want to distinguish between down+out OSDs that were failed vs those that have been "acknowledged" by an admin to be dead, we will need to add some additional state (possibly reusing the AUTOOUT flag?), but that will require more discussion. Backport: firefly (maybe) Signed-off-by: Sage Weil --- diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 46e47cb223d1..b22ab6a89ee7 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -2018,29 +2018,33 @@ void OSDMonitor::get_health(list >& summary, list > *detail) const { int num_osds = osdmap.get_num_osds(); - int num_up_osds = osdmap.get_num_up_osds(); - int num_in_osds = osdmap.get_num_in_osds(); if (num_osds == 0) { summary.push_back(make_pair(HEALTH_ERR, "no osds")); } else { - if (num_up_osds < num_in_osds) { - ostringstream ss; - ss << (num_in_osds - num_up_osds) << "/" << num_in_osds << " in osds are down"; - summary.push_back(make_pair(HEALTH_WARN, ss.str())); - - if (detail) { - for (int i = 0; i < osdmap.get_max_osd(); i++) { - if (osdmap.exists(i) && !osdmap.is_up(i)) { - const osd_info_t& info = osdmap.get_info(i); - ostringstream ss; - ss << "osd." << i << " is down since epoch " << info.down_at - << ", last address " << osdmap.get_addr(i); - detail->push_back(make_pair(HEALTH_WARN, ss.str())); - } + int num_in_osds = 0; + int num_down_in_osds = 0; + for (int i = 0; i < osdmap.get_max_osd(); i++) { + if (!osdmap.exists(i) || osdmap.is_out(i)) + continue; + ++num_in_osds; + if (!osdmap.is_up(i)) { + ++num_down_in_osds; + if (detail) { + const osd_info_t& info = osdmap.get_info(i); + ostringstream ss; + ss << "osd." << i << " is down since epoch " << info.down_at + << ", last address " << osdmap.get_addr(i); + detail->push_back(make_pair(HEALTH_WARN, ss.str())); } } } + assert(num_down_in_osds <= num_in_osds); + if (num_down_in_osds > 0) { + ostringstream ss; + ss << num_down_in_osds << "/" << num_in_osds << " in osds are down"; + summary.push_back(make_pair(HEALTH_WARN, ss.str())); + } // warn about flags if (osdmap.test_flag(CEPH_OSDMAP_PAUSERD |