From: Sage Weil Date: Thu, 30 Mar 2017 14:40:23 +0000 (-0400) Subject: mon/MgrMonitor: health warn/err if no active mgr X-Git-Tag: v12.0.2~252^2~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=31419d0e0f7e940b1a5fc524d589d2cce81f8c35;p=ceph.git mon/MgrMonitor: health warn/err if no active mgr Start warning once mons are luminous; start erroring once require_luminous is set in osdmap. Allow a grace period for mgr to restart or standby to take over before we turn a warning into an error. Signed-off-by: Sage Weil --- diff --git a/qa/releases/luminous.yaml b/qa/releases/luminous.yaml index 943fdeba4ae6..b7ff0dc53b6b 100644 --- a/qa/releases/luminous.yaml +++ b/qa/releases/luminous.yaml @@ -16,3 +16,5 @@ overrides: conf: mon: mon warn on osd down out interval zero: false + log-whitelist: + - no active mgr diff --git a/src/common/config_opts.h b/src/common/config_opts.h index aa6b16baee2c..1fcf1c21f09a 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1648,6 +1648,7 @@ OPTION(mgr_connect_retry_interval, OPT_DOUBLE, 1.0) OPTION(mon_mgr_digest_period, OPT_INT, 5) // How frequently to send digests OPTION(mon_mgr_beacon_grace, OPT_INT, 30) // How long to wait to failover +OPTION(mon_mgr_inactive_grace, OPT_INT, 60) // How long before health WARN -> ERR OPTION(rgw_list_bucket_min_readahead, OPT_INT, 1000) // minimum number of entries to read from rados for bucket listing diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index 39a80a466498..fcef7ca50f81 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -19,6 +19,7 @@ #include "PGMonitor.h" #include "include/stringify.h" #include "mgr/MgrContext.h" +#include "OSDMonitor.h" #include "MgrMonitor.h" @@ -46,6 +47,12 @@ void MgrMonitor::update_from_paxos(bool *need_bootstrap) dout(4) << "active server: " << map.active_addr << "(" << map.active_gid << ")" << dendl; + if (map.available) { + first_seen_inactive = utime_t(); + } else { + 
first_seen_inactive = ceph_clock_now(); + } + check_subs(); } @@ -297,6 +304,36 @@ void MgrMonitor::send_digests() mon->timer.add_event_after(g_conf->mon_mgr_digest_period, digest_callback); } +void MgrMonitor::get_health( + list<pair<health_status_t,string> >& summary, + list<pair<health_status_t,string> > *detail, + CephContext *cct) const +{ + // start mgr warnings as soon as the mons and osds are all upgraded, + // but before the require_luminous osdmap flag is set. this way the + // user gets some warning before the osd flag is set and mgr is + // actually *required*. + if (!mon->monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_LUMINOUS) || + !HAVE_FEATURE(mon->osdmon()->osdmap.get_up_osd_features(), + SERVER_LUMINOUS)) { + return; + } + + if (!map.available) { + auto level = HEALTH_WARN; + // do not escalate to ERR if they are still upgrading to luminous. + if (mon->osdmon()->osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { + utime_t now = ceph_clock_now(); + if (first_seen_inactive != utime_t() && + now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) { + level = HEALTH_ERR; + } + } + summary.push_back(make_pair(level, "no active mgr")); + } +} + void MgrMonitor::tick() { if (!is_active() || !mon->is_leader()) diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h index 7e5fbce5438d..b82b6df58a25 100644 --- a/src/mon/MgrMonitor.h +++ b/src/mon/MgrMonitor.h @@ -22,6 +22,8 @@ class MgrMonitor : public PaxosService MgrMap map; MgrMap pending_map; + utime_t first_seen_inactive; + std::map last_beacon; /** @@ -70,6 +72,10 @@ public: void check_subs(); void send_digests(); + void get_health(list<pair<health_status_t,string> >& summary, + list<pair<health_status_t,string> > *detail, + CephContext *cct) const override; + void tick() override; void print_summary(Formatter *f, std::ostream *ss) const;