mon: OSDMonitor: HEALTH_WARN on 'mon osd down out interval == 0'

author Joao Eduardo Luis <joao.luis@inktank.com>

Fri, 18 Apr 2014 18:15:52 +0000 (19:15 +0100)

committer Joao Eduardo Luis <joao.luis@inktank.com>

Fri, 18 Apr 2014 18:15:52 +0000 (19:15 +0100)
author Joao Eduardo Luis <joao.luis@inktank.com>
Fri, 18 Apr 2014 18:15:52 +0000 (19:15 +0100)
committer Joao Eduardo Luis <joao.luis@inktank.com>
Fri, 18 Apr 2014 18:15:52 +0000 (19:15 +0100)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index 7494e6e9e45c2a0ce0024662a82c907d24c74f40..7b4fb841e17b381af73f3dbc5b34d90164f9a1f6 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -5,3 +5,8 @@ v0.80
  * OSDMap's json-formatted dump changed for keys 'full' and 'nearfull'.
    What was previously being outputted as 'true' or 'false' strings are
    now being outputted 'true' and 'false' booleans according to json syntax.
+
+* HEALTH_WARN on 'mon osd down out interval == 0'. Having this option set
+  to zero on the leader acts much like having the 'noout' flag set.  This
+  warning will only be reported if the monitor getting the 'health' or
+  'status' request has this option set to zero.
diff --git a/src/common/config_opts.h b/src/common/config_opts.h

index 935a4836edf805cee9c5e3e3e0bfca02e9a82cf6..a065a772f9b13021ca87ba64fa99e95ac940ca82 100644 (file)
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -176,6 +176,7 @@ OPTION(mon_osd_report_timeout, OPT_INT, 900)    // grace period before declaring
  OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active
  OPTION(mon_warn_on_old_mons, OPT_BOOL, true) // should mons set health to WARN if part of quorum is old?
  OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL, true) // warn if crush tunables are not optimal
+OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
  OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
  OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
  OPTION(mon_max_log_epochs, OPT_INT, 500)
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc

index 2b4a3a4a3d69e6f140ae89095c6d77576acec668..fa3d9cfd5778e70633e65dfd84db2d3cc44ac3f7 100644 (file)
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -2062,6 +2062,29 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
        }
      }
  
+    // Warn if 'mon_osd_down_out_interval' is set to zero.
+    // Having this option set to zero on the leader acts much like the
+    // 'noout' flag.  It's hard to figure out what's going wrong with clusters
+    // without the 'noout' flag set but acting like that just the same, so
+    // we report a HEALTH_WARN in case this option is set to zero.
+    // This is an ugly hack to get the warning out, but until we find a way
+    // to spread global options throughout the mon cluster and have all mons
+    // using a base set of the same options, we need to work around this sort
+    // of thing.
+    // There's also the obvious drawback that if this is set on a single
+    // monitor on a 3-monitor cluster, this warning will only be shown every
+    // third monitor connection.
+    if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
+        g_conf->mon_osd_down_out_interval == 0) {
+      ostringstream ss;
+      ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
+      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+      if (detail) {
+        ss << "; this has the same effect as the 'noout' flag";
+        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
+    }
+
      get_pools_health(summary, detail);
    }
  }
author	Joao Eduardo Luis <joao.luis@inktank.com>
	Fri, 18 Apr 2014 18:15:52 +0000 (19:15 +0100)
committer	Joao Eduardo Luis <joao.luis@inktank.com>
	Fri, 18 Apr 2014 18:15:52 +0000 (19:15 +0100)
PendingReleaseNotes		patch \| blob \| history
src/common/config_opts.h		patch \| blob \| history
src/mon/OSDMonitor.cc		patch \| blob \| history