From: Brad Hubbard Date: Tue, 3 Apr 2018 07:49:08 +0000 (+1000) Subject: mon/PGMap: Summarise OSDs in blocked/stuck requests X-Git-Tag: v12.2.6~168^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=81fa8a7499a7b688aeaa28fba181033dcf692f54;p=ceph.git mon/PGMap: Summarise OSDs in blocked/stuck requests This fix is luminous specific since this code was changed in Mimic to use the new SLOW_OPS errors. Fixes: https://tracker.ceph.com/issues/23205 Signed-off-by: Brad Hubbard --- diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 504adb53cd94..db19af042359 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -3185,14 +3185,11 @@ void PGMap::get_health_checks( } if (!warn_detail.empty()) { - ostringstream ss; - ss << warn << " slow requests are blocked > " - << cct->_conf->mon_osd_warn_op_age << " sec"; - auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str()); - d.detail.swap(warn_detail); int left = max; + set<int> implicated_osds; for (auto& p : warn_osd_by_max) { ostringstream ss; + implicated_osds.insert(p.second.begin(), p.second.end()); if (p.second.size() > 1) { ss << "osds " << p.second << " have blocked requests > " << p.first << " sec"; @@ -3200,21 +3197,24 @@ void PGMap::get_health_checks( ss << "osd." << *p.second.begin() << " has blocked requests > " << p.first << " sec"; } - d.detail.push_back(ss.str()); + warn_detail.push_back(ss.str()); if (--left == 0) { break; } } + ostringstream ss; + ss << warn << " slow requests are blocked > " + << cct->_conf->mon_osd_warn_op_age << " sec. 
Implicated osds " + << implicated_osds; + auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str()); + d.detail.swap(warn_detail); } if (!error_detail.empty()) { - ostringstream ss; - ss << error << " stuck requests are blocked > " - << err_age << " sec"; - auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str()); - d.detail.swap(error_detail); int left = max; + set<int> implicated_osds; for (auto& p : error_osd_by_max) { ostringstream ss; + implicated_osds.insert(p.second.begin(), p.second.end()); if (p.second.size() > 1) { ss << "osds " << p.second << " have stuck requests > " << p.first << " sec"; @@ -3222,11 +3222,16 @@ void PGMap::get_health_checks( ss << "osd." << *p.second.begin() << " has stuck requests > " << p.first << " sec"; } - d.detail.push_back(ss.str()); + error_detail.push_back(ss.str()); if (--left == 0) { break; } } + ostringstream ss; + ss << error << " stuck requests are blocked > " + << err_age << " sec. Implicated osds " << implicated_osds; + auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str()); + d.detail.swap(error_detail); } }