From 7cacb701ad4a7281089a07c64196508fcb603acc Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@redhat.com>
Date: Wed, 7 Jun 2017 12:16:58 -0400
Subject: [PATCH] mon/PGMap: call requests blocked for 128x as long ERR not
 WARN

- rename the option (max -> warn)
- add an err_..._ratio multiplier
- switch to HEALTH_ERR once requests are blocked long enough
- make the error ratio high (default is 32*128s -> about an hour) so that
we don't trigger on a heavily loaded cluster.

Signed-off-by: Sage Weil <sage@redhat.com>
---
 PendingReleaseNotes      | 10 ++++++
 src/common/config_opts.h |  3 +-
 src/mon/PGMap.cc         | 76 ++++++++++++++++++++++++++++------------
 3 files changed, 65 insertions(+), 24 deletions(-)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 6ef3e724586..ec4dda767f7 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -137,3 +137,13 @@
   to zero will effectively disable the health check.
 * The "ceph mds tell ..." command has been removed.  It is superceded
   by "ceph tell mds.<id> ..."
+
+12.1.0
+------
+
+* The ``mon_osd_max_op_age`` option has been renamed to
+  ``mon_osd_warn_op_age`` (default: 32 seconds), to indicate we
+  generate a warning at this age.  There is also a new
+  ``mon_osd_err_op_age_ratio`` that is expressed as a multiple of
+  ``mon_osd_warn_op_age`` (default: 128, for roughly 60 minutes) to
+  control when an error is generated.
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 37810a475a3..41cc211c812 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -288,7 +288,8 @@ OPTION(mon_osd_down_out_interval, OPT_INT, 600) // seconds
 OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack")   // smallest crush unit/type that we will not automatically mark out
 OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3)    // min osds required to be up to mark things down
 OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .75)   // min osds required to be in to mark things out
-OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32)     // max op age before we get concerned (make it a power of 2)
+OPTION(mon_osd_warn_op_age, OPT_DOUBLE, 32)     // max op age before we generate a warning (make it a power of 2)
+OPTION(mon_osd_err_op_age_ratio, OPT_DOUBLE, 128)  // when to generate an error, as multiple of mon_osd_warn_op_age
 OPTION(mon_osd_max_split_count, OPT_INT, 32) // largest number of PGs per "involved" OSD to let split create
 OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false)  // allow primary_temp to be set in the osdmap
 OPTION(mon_osd_allow_primary_affinity, OPT_BOOL, false)  // allow primary_affinity to be set in the osdmap
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index 5eddb50200a..a623e20b2ea 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -2431,7 +2431,7 @@ static void note_stuck_detail(
   }
 }
 
-static int _warn_slow_request_histogram(
+static pair<int,int> _warn_slow_request_histogram(
   CephContext *cct,
   const pow2_hist_t& h,
   string suffix,
@@ -2439,23 +2439,31 @@ static int _warn_slow_request_histogram(
   list<pair<health_status_t,string> > *detail)
 {
   if (h.h.empty())
-    return 0;
+    return make_pair(0, 0);
 
-  unsigned sum = 0;
+  unsigned warn = 0, error = 0;
+  float err_age =
+    cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
   for (unsigned i = h.h.size() - 1; i > 0; --i) {
     float ub = (float)(1 << i) / 1000.0;
-    if (ub < cct->_conf->mon_osd_max_op_age)
+    if (ub < cct->_conf->mon_osd_warn_op_age)
       break;
     if (h.h[i]) {
+      auto sev = HEALTH_WARN;
+      if (ub > err_age) {
+	sev = HEALTH_ERR;
+	error += h.h[i];
+      } else {
+	warn += h.h[i];
+      }
       if (detail) {
 	ostringstream ss;
 	ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
-	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+	detail->push_back(make_pair(sev, ss.str()));
       }
-      sum += h.h[i];
     }
   }
-  return sum;
+  return make_pair(warn, error);
 }
 
 namespace {
@@ -2708,33 +2716,55 @@ void PGMap::get_health(
   }
 
   // slow requests
-  if (cct->_conf->mon_osd_max_op_age > 0 &&
-      osd_sum.op_queue_age_hist.upper_bound() > cct->_conf->mon_osd_max_op_age) {
-    unsigned sum = _warn_slow_request_histogram(
+  if (cct->_conf->mon_osd_warn_op_age > 0 &&
+      osd_sum.op_queue_age_hist.upper_bound() > cct->_conf->mon_osd_warn_op_age) {
+    auto sum = _warn_slow_request_histogram(
       cct, osd_sum.op_queue_age_hist, "", summary, NULL);
-    if (sum > 0) {
-      ostringstream ss;
-      ss << sum << " requests are blocked > " << cct->_conf->mon_osd_max_op_age
-	 << " sec";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+    if (sum.first > 0 || sum.second > 0) {
+      if (sum.first > 0) {
+	ostringstream ss;
+	ss << sum.first << " requests are blocked > "
+	   << cct->_conf->mon_osd_warn_op_age
+	   << " sec";
+	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
+      if (sum.second > 0) {
+	// sum.second is the error-level count; sum.first holds the warn-level one
+	ostringstream ss;
+	ss << sum.second << " requests are blocked > "
+	   << (cct->_conf->mon_osd_warn_op_age *
+	       cct->_conf->mon_osd_err_op_age_ratio) << " sec";
+	summary.push_back(make_pair(HEALTH_ERR, ss.str()));
+      }
 
       if (detail) {
-	unsigned num_slow_osds = 0;
+	unsigned num_warn = 0, num_err = 0;
 	// do per-osd warnings
 	for (auto p = osd_stat.begin();
 	     p != osd_stat.end();
 	     ++p) {
-	  if (_warn_slow_request_histogram(
+	  auto sum = _warn_slow_request_histogram(
 		cct,
 		p->second.op_queue_age_hist,
 		string(" on osd.") + stringify(p->first),
-		summary, detail))
-	    ++num_slow_osds;
+		summary, detail);
+	  if (sum.second)
+	    ++num_err;
+	  else if (sum.first)
+	    ++num_warn;
+	}
+	if (num_err) {
+	  ostringstream ss2;
+	  ss2 << num_err << " osds have very slow requests";
+	  summary.push_back(make_pair(HEALTH_ERR, ss2.str()));
+	  detail->push_back(make_pair(HEALTH_ERR, ss2.str()));
+	}
+	if (num_warn) {  // OSDs with warn-level ops only; error-level OSDs are in num_err
+	  ostringstream ss2;
+	  ss2 << num_warn << " osds have slow requests";
+	  summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
+	  detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
 	}
-	ostringstream ss2;
-	ss2 << num_slow_osds << " osds have slow requests";
-	summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
-	detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
       }
     }
   }
-- 
2.39.5