From 7cacb701ad4a7281089a07c64196508fcb603acc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 7 Jun 2017 12:16:58 -0400 Subject: [PATCH] mon/PGMap: call requests blocked for 128x as long ERR not WARN - rename the option (max -> warn) - add an err_..._ratio multiplier - switch to HEALTH_ERR once requests are blocked long enough - make the error ratio high (default is 32*128s -> about an hour) so that we don't trigger on a heavily loaded cluster. Signed-off-by: Sage Weil --- PendingReleaseNotes | 10 ++++++ src/common/config_opts.h | 3 +- src/mon/PGMap.cc | 76 ++++++++++++++++++++++++++++------------ 3 files changed, 65 insertions(+), 24 deletions(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 6ef3e724586..ec4dda767f7 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -137,3 +137,13 @@ to zero will effectively disable the health check. * The "ceph mds tell ..." command has been removed. It is superceded by "ceph tell mds. ..." + +12.1.0 +------ + +* The ``mon_osd_max_op_age`` option has been renamed to + ``mon_osd_warn_op_age`` (default: 32 seconds), to indicate we + generate a warning at this age. There is also a new + ``mon_osd_err_op_age_ratio`` that is expressed as a multiple of + ``mon_osd_warn_op_age`` (default: 128, for roughly 60 minutes) to + control when an error is generated. 
diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 37810a475a3..41cc211c812 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -288,7 +288,8 @@ OPTION(mon_osd_down_out_interval, OPT_INT, 600) // seconds OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // smallest crush unit/type that we will not automatically mark out OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .75) // min osds required to be in to mark things out -OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32) // max op age before we get concerned (make it a power of 2) +OPTION(mon_osd_warn_op_age, OPT_DOUBLE, 32) // max op age before we generate a warning (make it a power of 2) +OPTION(mon_osd_err_op_age_ratio, OPT_DOUBLE, 128) // when to generate an error, as multiple of mon_osd_warn_op_age OPTION(mon_osd_max_split_count, OPT_INT, 32) // largest number of PGs per "involved" OSD to let split create OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false) // allow primary_temp to be set in the osdmap OPTION(mon_osd_allow_primary_affinity, OPT_BOOL, false) // allow primary_affinity to be set in the osdmap diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 5eddb50200a..a623e20b2ea 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -2431,7 +2431,7 @@ static void note_stuck_detail( } } -static int _warn_slow_request_histogram( +static pair _warn_slow_request_histogram( CephContext *cct, const pow2_hist_t& h, string suffix, @@ -2439,23 +2439,31 @@ static int _warn_slow_request_histogram( list > *detail) { if (h.h.empty()) - return 0; + return make_pair(0, 0); - unsigned sum = 0; + unsigned warn = 0, error = 0; + float err_age = + cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio; for (unsigned i = h.h.size() - 1; i > 0; --i) { float ub = (float)(1 << i) / 1000.0; - if (ub < cct->_conf->mon_osd_max_op_age) + if (ub < cct->_conf->mon_osd_warn_op_age) break; if 
(h.h[i]) { + auto sev = HEALTH_WARN; + if (ub > err_age) { + sev = HEALTH_ERR; + error += h.h[i]; + } else { + warn += h.h[i]; + } if (detail) { ostringstream ss; ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix; - detail->push_back(make_pair(HEALTH_WARN, ss.str())); + detail->push_back(make_pair(sev, ss.str())); } - sum += h.h[i]; } } - return sum; + return make_pair(warn, error); } namespace { @@ -2708,33 +2716,55 @@ void PGMap::get_health( } // slow requests - if (cct->_conf->mon_osd_max_op_age > 0 && - osd_sum.op_queue_age_hist.upper_bound() > cct->_conf->mon_osd_max_op_age) { - unsigned sum = _warn_slow_request_histogram( + if (cct->_conf->mon_osd_warn_op_age > 0 && + osd_sum.op_queue_age_hist.upper_bound() > cct->_conf->mon_osd_warn_op_age) { + auto sum = _warn_slow_request_histogram( cct, osd_sum.op_queue_age_hist, "", summary, NULL); - if (sum > 0) { - ostringstream ss; - ss << sum << " requests are blocked > " << cct->_conf->mon_osd_max_op_age - << " sec"; - summary.push_back(make_pair(HEALTH_WARN, ss.str())); + if (sum.first > 0 || sum.second > 0) { + if (sum.first > 0) { + ostringstream ss; + ss << sum.first << " requests are blocked > " + << cct->_conf->mon_osd_warn_op_age + << " sec"; + summary.push_back(make_pair(HEALTH_WARN, ss.str())); + } + if (sum.second > 0) { + ostringstream ss; + ss << sum.second << " requests are blocked > " + << (cct->_conf->mon_osd_warn_op_age * + cct->_conf->mon_osd_err_op_age_ratio) + << " sec"; + summary.push_back(make_pair(HEALTH_ERR, ss.str())); + } if (detail) { - unsigned num_slow_osds = 0; + unsigned num_warn = 0, num_err = 0; // do per-osd warnings for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) { - if (_warn_slow_request_histogram( + auto sum = _warn_slow_request_histogram( cct, p->second.op_queue_age_hist, string(" on osd.") + stringify(p->first), - summary, detail)) - ++num_slow_osds; + summary, detail); + if (sum.second) + ++num_err; + else if (sum.first) + ++num_warn; + } + if (num_err) 
{ + ostringstream ss2; + ss2 << num_err << " osds have very slow requests"; + summary.push_back(make_pair(HEALTH_ERR, ss2.str())); + detail->push_back(make_pair(HEALTH_ERR, ss2.str())); + } + if (num_warn) { + ostringstream ss2; + ss2 << num_warn << " osds have slow requests"; + summary.push_back(make_pair(HEALTH_WARN, ss2.str())); + detail->push_back(make_pair(HEALTH_WARN, ss2.str())); } - ostringstream ss2; - ss2 << num_slow_osds << " osds have slow requests"; - summary.push_back(make_pair(HEALTH_WARN, ss2.str())); - detail->push_back(make_pair(HEALTH_WARN, ss2.str())); } } } -- 2.39.5