From 6a9895b97a2b2fb533092f294e45fede154a7f82 Mon Sep 17 00:00:00 2001
From: David Zafman <dzafman@redhat.com>
Date: Tue, 15 Jan 2019 12:12:39 -0800
Subject: [PATCH] mon: Fix scrub health warning handling and change config to a
 ratio

Make this mon_warn code clearer since it involves 2 values
Code used mon scrub interval instead of pg scrub interval
Rename config values to include _pg_ and ratio to make it more clear
Fix scrub warning handling to use per-pool intervals when specified

Fixes: http://tracker.ceph.com/issues/37264

Signed-off-by: David Zafman <dzafman@redhat.com>
---
 PendingReleaseNotes                    |   5 ++
 doc/rados/operations/health-checks.rst |   8 +-
 src/common/legacy_config_opts.h        |   4 +-
 src/common/options.cc                  |  20 ++---
 src/mon/PGMap.cc                       | 100 ++++++++++++++-----------
 5 files changed, 79 insertions(+), 58 deletions(-)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 39f273544411f..9ff4518af0d4e 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -160,6 +160,11 @@
 * The 'cephfs-data-scan scan_links' now automatically repair inotables and
   snaptable.
 
+* Configuration values mon_warn_not_scrubbed/mon_warn_not_deep_scrubbed have been
+  renamed.  They are now mon_warn_pg_not_scrubbed_ratio/mon_warn_pg_not_deep_scrubbed_ratio
+  respectively.  This is to clarify that these warnings are related to pg scrubbing
+  and are a ratio of the related interval.  These options are now enabled by default.
+
 >=13.1.0
 --------
 
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst
index dbb8f5224be0f..ba8d663cebac3 100644
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -741,8 +741,8 @@ _______________
 
 One or more PGs has not been scrubbed recently.  PGs are normally
 scrubbed every ``mon_scrub_interval`` seconds, and this warning
-triggers when ``mon_warn_not_scrubbed`` such intervals have elapsed
-without a scrub.
+triggers when a further ``mon_warn_pg_not_scrubbed_ratio`` fraction of that
+interval has elapsed past the time the scrub was due without a scrub.
 
 PGs will not scrub if they are not flagged as *clean*, which may
 happen if they are misplaced or degraded (see *PG_AVAILABILITY* and
@@ -757,8 +757,8 @@ ____________________
 
 One or more PGs has not been deep scrubbed recently.  PGs are normally
 scrubbed every ``osd_deep_mon_scrub_interval`` seconds, and this warning
-triggers when ``mon_warn_not_deep_scrubbed`` such intervals have elapsed
-without a scrub.
+triggers when a further ``mon_warn_pg_not_deep_scrubbed_ratio`` fraction of that
+interval has elapsed past the time the deep scrub was due without a deep scrub.
 
 PGs will not (deep) scrub if they are not flagged as *clean*, which may
 happen if they are misplaced or degraded (see *PG_AVAILABILITY* and
diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h
index a8fd2ac4eaf45..4a53082b506c9 100644
--- a/src/common/legacy_config_opts.h
+++ b/src/common/legacy_config_opts.h
@@ -274,8 +274,8 @@ OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE)
 OPTION(mon_data_avail_crit, OPT_INT)
 OPTION(mon_data_avail_warn, OPT_INT)
 OPTION(mon_data_size_warn, OPT_U64) // issue a warning when the monitor's data store goes over 15GB (in bytes)
-OPTION(mon_warn_not_scrubbed, OPT_INT)
-OPTION(mon_warn_not_deep_scrubbed, OPT_INT)
+OPTION(mon_warn_pg_not_scrubbed_ratio, OPT_FLOAT)
+OPTION(mon_warn_pg_not_deep_scrubbed_ratio, OPT_FLOAT)
 OPTION(mon_scrub_interval, OPT_INT) // once a day
 OPTION(mon_scrub_timeout, OPT_INT) // let's give it 5 minutes; why not.
 OPTION(mon_scrub_max_keys, OPT_INT) // max number of keys to scrub each time
diff --git a/src/common/options.cc b/src/common/options.cc
index dab00da1f1d56..f0e661e86c4ba 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -1715,16 +1715,18 @@ std::vector<Option> get_global_options() {
     .add_service("mon")
     .set_description("issue MON_DISK_BIG health warning when mon database is above this size"),
 
-    Option("mon_warn_not_scrubbed", Option::TYPE_INT, Option::LEVEL_ADVANCED)
-    .set_default(0)
-    .add_service("mon")
-    .set_description("if non-zero, issue PG_NOT_SCRUBBED when PG(s) have not been scrubbed for more than this long beyond the configured mon_scrub_interval (seconds)")
-    .add_see_also("osd_scrub_min_interval"),
+    Option("mon_warn_pg_not_scrubbed_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.5)
+    .set_min(0)
+    .set_description("Percentage of the scrub max interval past the scrub max interval to warn")
+    .set_long_description("")
+    .add_see_also("osd_scrub_max_interval"),
 
-    Option("mon_warn_not_deep_scrubbed", Option::TYPE_INT, Option::LEVEL_ADVANCED)
-    .set_default(0)
-    .add_service("mon")
-    .set_description("if non-zero, issue PG_NOT_DEEP_SCRUBBED when PG(s) have not been scrubbed for more than this long beyond the configured mon_scrub_interval (seconds)")
+    Option("mon_warn_pg_not_deep_scrubbed_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.75)
+    .set_min(0)
+    .set_description("Percentage of the deep scrub interval past the deep scrub interval to warn")
+    .set_long_description("")
     .add_see_also("osd_deep_scrub_interval"),
 
     Option("mon_scrub_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index b4894c035e8e1..4b8f9e8d1f929 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -2941,24 +2941,28 @@ void PGMap::get_health_checks(
 
   // PG_NOT_SCRUBBED
   // PG_NOT_DEEP_SCRUBBED
-  {
-    if (cct->_conf->mon_warn_not_scrubbed ||
-        cct->_conf->mon_warn_not_deep_scrubbed) {
-      list<string> detail, deep_detail;
-      int detail_max = max, deep_detail_max = max;
-      int detail_more = 0, deep_detail_more = 0;
-      int detail_total = 0, deep_detail_total = 0;
-      const double age = cct->_conf->mon_warn_not_scrubbed +
-        cct->_conf->mon_scrub_interval;
-      utime_t cutoff = now;
-      cutoff -= age;
-      const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed +
-        cct->_conf->osd_deep_scrub_interval;
-      utime_t deep_cutoff = now;
-      deep_cutoff -= deep_age;
-      for (auto& p : pg_stat) {
-        if (cct->_conf->mon_warn_not_scrubbed &&
-            p.second.last_scrub_stamp < cutoff) {
+  if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
+        cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
+    list<string> detail, deep_detail;
+    int detail_max = max, deep_detail_max = max;
+    int detail_more = 0, deep_detail_more = 0;
+    int detail_total = 0, deep_detail_total = 0;
+    for (auto& p : pg_stat) {
+      int64_t pnum =  p.first.pool();
+      auto pool = osdmap.get_pg_pool(pnum);
+      if (!pool)
+        continue;
+      if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
+        double scrub_max_interval = 0;
+        pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
+        if (scrub_max_interval <= 0) {
+          scrub_max_interval = cct->_conf->osd_scrub_max_interval;
+        }
+        const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
+          scrub_max_interval;
+        utime_t cutoff = now;
+        cutoff -= age;
+        if (p.second.last_scrub_stamp < cutoff) {
           if (detail_max > 0) {
             ostringstream ss;
             ss << "pg " << p.first << " not scrubbed since "
@@ -2970,8 +2974,18 @@ void PGMap::get_health_checks(
           }
           ++detail_total;
         }
-        if (cct->_conf->mon_warn_not_deep_scrubbed &&
-            p.second.last_deep_scrub_stamp < deep_cutoff) {
+      }
+      if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
+        double deep_scrub_interval = 0;
+        pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
+        if (deep_scrub_interval <= 0) {
+          deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
+        }
+        double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
+          deep_scrub_interval;
+        utime_t deep_cutoff = now;
+        deep_cutoff -= deep_age;
+        if (p.second.last_deep_scrub_stamp < deep_cutoff) {
           if (deep_detail_max > 0) {
             ostringstream ss;
             ss << "pg " << p.first << " not deep-scrubbed since "
@@ -2982,36 +2996,36 @@ void PGMap::get_health_checks(
             ++deep_detail_more;
           }
           ++deep_detail_total;
-        } 
+        }
       }
-      if (detail_total) {
-        ostringstream ss;
-        ss << detail_total << " pgs not scrubbed for " << age;
-        auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
+    }
+    if (detail_total) {
+      ostringstream ss;
+      ss << detail_total << " pgs not scrubbed in time";
+      auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
 
-        if (!detail.empty()) {
-          d.detail.swap(detail);
+      if (!detail.empty()) {
+        d.detail.swap(detail);
 
-          if (detail_more) {
-            ostringstream ss;
-            ss << detail_more << " more pgs... ";
-            d.detail.push_back(ss.str());
-          }
+        if (detail_more) {
+          ostringstream ss;
+          ss << detail_more << " more pgs... ";
+          d.detail.push_back(ss.str());
         }
       }
-      if (deep_detail_total) {
-        ostringstream ss;
-        ss << deep_detail_total << " pgs not deep-scrubbed for " << deep_age;
-        auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
+    }
+    if (deep_detail_total) {
+      ostringstream ss;
+      ss << deep_detail_total << " pgs not deep-scrubbed in time";
+      auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
 
-        if (!deep_detail.empty()) {
-          d.detail.swap(deep_detail);
+      if (!deep_detail.empty()) {
+        d.detail.swap(deep_detail);
 
-          if (deep_detail_more) {
-            ostringstream ss;
-            ss << deep_detail_more << " more pgs... ";
-            d.detail.push_back(ss.str());
-          }
+        if (deep_detail_more) {
+          ostringstream ss;
+          ss << deep_detail_more << " more pgs... ";
+          d.detail.push_back(ss.str());
         }
       }
     }
-- 
2.39.5