From f1670fdafc4bb8ceb7c6cbbc683ace2c0dd08872 Mon Sep 17 00:00:00 2001
From: Prashant D <pdhange@redhat.com>
Date: Fri, 30 Oct 2020 06:40:43 -0400
Subject: [PATCH] mon: Log "ceph health detail" periodically in cluster log

Change mon_health_to_clog_interval from 1_hr to 10_min to
log the health summary or detail more frequently.

Nautilus has no HealthMonitor class, so the change is made in
Monitor::do_health_to_clog() instead.

Fixes: https://tracker.ceph.com/issues/48042

Signed-off-by: Prashant Dhange <pdhange@redhat.com>
(cherry picked from commit f45712c19077c5cf5a9938fc3fd17b64ffe3a4ec)

 Conflicts:
	PendingReleaseNotes - add and restructure 14.2.16
---
 PendingReleaseNotes             | 22 +++++++++++++++++++---
 qa/tasks/ceph.conf.template     |  1 +
 src/common/legacy_config_opts.h |  1 +
 src/common/options.cc           |  6 +++++-
 src/mon/Monitor.cc              | 11 ++++++++++-
 5 files changed, 36 insertions(+), 5 deletions(-)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 1722a43bfaff..acbe6dcc4297 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -1,9 +1,25 @@
+14.2.16
+-------
+
+* The structured output of ``ceph status`` or ``ceph -s`` is now more
+  concise, particularly the ``mgrmap`` and ``monmap`` sections, and the
+  structure of the ``osdmap`` section has been cleaned up.
+
+* MON: The cluster log now logs health detail every ``mon_health_to_clog_interval``,
+  which has been changed from 1hr to 10min. Logging of health detail will be
+  skipped if the health summary has not changed since it was last logged.
+
+
 14.2.15
 -------
 
 * MGR: progress module can now be turned on/off, using the commands:
   ``ceph progress on`` and ``ceph progress off``.
 
-* The structured output of ``ceph status`` or ``ceph -s`` is now more
-  concise, particularly the ``mgrmap`` and ``monmap`` sections, and the
-  structure of the ``osdmap`` section has been cleaned up.
+
+14.2.13
+-------
+
+* This release fixes a regression introduced in 14.2.12 which broke deployments
+  that referred to MON hosts using DNS names instead of IP addresses in the
+  ``mon_host`` parameter in ``/etc/ceph/ceph.conf``.
diff --git a/qa/tasks/ceph.conf.template b/qa/tasks/ceph.conf.template
index 5be8c06bd16d..0c0f503abfd0 100644
--- a/qa/tasks/ceph.conf.template
+++ b/qa/tasks/ceph.conf.template
@@ -36,6 +36,7 @@
 
 	mon cluster log file level = debug
 	debug asserts on shutdown = true
+	mon health detail to clog = false
 
 	# we see this fail in qa on *nautilus*; bump up retries
 	mon_client_directed_command_retry = 4
diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h
index 586b3f6d8e16..d1869cc00318 100644
--- a/src/common/legacy_config_opts.h
+++ b/src/common/legacy_config_opts.h
@@ -272,6 +272,7 @@ OPTION(mon_reweight_max_change, OPT_DOUBLE)
 OPTION(mon_health_to_clog, OPT_BOOL)
 OPTION(mon_health_to_clog_interval, OPT_INT)
 OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE)
+OPTION(mon_health_detail_to_clog, OPT_BOOL)
 OPTION(mon_data_avail_crit, OPT_INT)
 OPTION(mon_data_avail_warn, OPT_INT)
 OPTION(mon_data_size_warn, OPT_U64) // issue a warning when the monitor's data store goes over 15GB (in bytes)
diff --git a/src/common/options.cc b/src/common/options.cc
index a4bd01a0f18a..df16c9457f74 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -1871,7 +1871,7 @@ std::vector<Option> get_global_options() {
     .set_description("log monitor health to cluster log"),
 
     Option("mon_health_to_clog_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
-    .set_default(1_hr)
+    .set_default(10_min)
     .add_service("mon")
     .set_description("frequency to log monitor health to cluster log")
     .add_see_also("mon_health_to_clog"),
@@ -1881,6 +1881,10 @@ std::vector<Option> get_global_options() {
     .add_service("mon")
     .set_description(""),
 
+    Option("mon_health_detail_to_clog", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("log health detail to cluster log"),
+
     Option("mon_health_max_detail", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(50)
     .add_service("mon")
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 793031d21c65..612acbb31a48 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -2777,7 +2777,16 @@ void Monitor::do_health_to_clog(bool force)
       summary == health_status_cache.summary &&
       level == health_status_cache.overall)
     return;
-  clog->health(level) << "overall " << summary;
+
+  if (g_conf()->mon_health_detail_to_clog &&
+      summary != health_status_cache.summary &&
+      level != HEALTH_OK) {
+    string details;
+    level = get_health_status(true, nullptr, &details);
+    clog->health(level) << "Health detail: " << details;
+  } else {
+    clog->health(level) << "overall " << summary;
+  }
   health_status_cache.summary = summary;
   health_status_cache.overall = level;
 }
-- 
2.47.3