git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon: Log "ceph health detail" periodically in cluster log 38345/head
author Prashant D <pdhange@redhat.com>
Fri, 30 Oct 2020 10:40:43 +0000 (06:40 -0400)
committer Nathan Cutler <ncutler@suse.com>
Mon, 30 Nov 2020 12:22:17 +0000 (13:22 +0100)
Change mon_health_to_clog_interval from 1_hr to 10_min so that the health
summary or detail is logged more frequently.

Fixes: https://tracker.ceph.com/issues/48042
Signed-off-by: Prashant Dhange <pdhange@redhat.com>
(cherry picked from commit f45712c19077c5cf5a9938fc3fd17b64ffe3a4ec)

Conflicts:
PendingReleaseNotes
- next Octopus release is 15.2.8 now

PendingReleaseNotes
qa/tasks/ceph.conf.template
src/common/legacy_config_opts.h
src/common/options.cc
src/mon/Monitor.cc

index c5fb58f63e5ca68cfc02f0d2655176bf1a66cdc7..a1f05a9875a3ffb5b32144fe8fef7edc17a298cb 100644 (file)
@@ -1,4 +1,4 @@
-15.2.6
+15.2.8
 ------
 
 * ceph-volume: The ``lvm batch`` subcommand received a major rewrite. This closed
@@ -7,6 +7,10 @@
   Please refer to https://docs.ceph.com/en/latest/ceph-volume/lvm/batch/ for
   more detailed information.
 
+* MON: The cluster log now logs health detail every ``mon_health_to_clog_interval``,
+  which has been changed from 1hr to 10min. Logging of health detail is
+  skipped if the health summary has not changed since it was last logged.
+
 * The ``ceph df`` command now lists the number of pgs in each pool.
 
 * The ``bluefs_preextend_wal_files`` option has been removed.
index 493eacb32caa3d0fbbb397cb767ac539efa6d352..8619817cf75bd0ee975371c78390d4cc801869e5 100644 (file)
@@ -39,6 +39,7 @@
 
        mon cluster log file level = debug
        debug asserts on shutdown = true
+       mon health detail to clog = false
 
 [osd]
         osd journal size = 100
index e59913b37bfe3018492a3c8c5161553af77e63dd..b8bf695951c7a0d92db67a15765c06cf91199504 100644 (file)
@@ -262,6 +262,7 @@ OPTION(mon_reweight_max_change, OPT_DOUBLE)
 OPTION(mon_health_to_clog, OPT_BOOL)
 OPTION(mon_health_to_clog_interval, OPT_INT)
 OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE)
+OPTION(mon_health_detail_to_clog, OPT_BOOL)
 OPTION(mon_data_avail_crit, OPT_INT)
 OPTION(mon_data_avail_warn, OPT_INT)
 OPTION(mon_data_size_warn, OPT_U64) // issue a warning when the monitor's data store goes over 15GB (in bytes)
index e79be858e95e20998ff159619220feca53c1dfeb..66d4ee9d06dcd4bcf90fb84dcbe7ad6f743ab1d2 100644 (file)
@@ -1861,7 +1861,7 @@ std::vector<Option> get_global_options() {
     .set_description("log monitor health to cluster log"),
 
     Option("mon_health_to_clog_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
-    .set_default(1_hr)
+    .set_default(10_min)
     .add_service("mon")
     .set_description("frequency to log monitor health to cluster log")
     .add_see_also("mon_health_to_clog"),
@@ -1871,6 +1871,10 @@ std::vector<Option> get_global_options() {
     .add_service("mon")
     .set_description(""),
 
+    Option("mon_health_detail_to_clog", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("log health detail to cluster log"),
+
     Option("mon_health_max_detail", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(50)
     .add_service("mon")
index eecd2f68c315af2efb9080e33d05d859ddac9489..cbde87cc614d005b5fe962ad3c34c9715d06fe53 100644 (file)
@@ -2772,7 +2772,16 @@ void Monitor::do_health_to_clog(bool force)
       summary == health_status_cache.summary &&
       level == health_status_cache.overall)
     return;
-  clog->health(level) << "overall " << summary;
+
+  if (g_conf()->mon_health_detail_to_clog &&
+      summary != health_status_cache.summary &&
+      level != HEALTH_OK) {
+    string details;
+    level = healthmon()->get_health_status(true, nullptr, &details);
+    clog->health(level) << "Health detail: " << details;
+  } else {
+    clog->health(level) << "overall " << summary;
+  }
   health_status_cache.summary = summary;
   health_status_cache.overall = level;
 }