From 31419d0e0f7e940b1a5fc524d589d2cce81f8c35 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 30 Mar 2017 10:40:23 -0400 Subject: [PATCH] mon/MgrMonitor: health warn/err if no active mgr Start warning once mons are luminous; start erroring once require_luminous is set in osdmap. Allow a grace period for mgr to restart or standby to take over before we turn a warning into an error. Signed-off-by: Sage Weil --- qa/releases/luminous.yaml | 2 ++ src/common/config_opts.h | 1 + src/mon/MgrMonitor.cc | 37 +++++++++++++++++++++++++++++++++++++ src/mon/MgrMonitor.h | 6 ++++++ 4 files changed, 46 insertions(+) diff --git a/qa/releases/luminous.yaml b/qa/releases/luminous.yaml index 943fdeba4ae66..b7ff0dc53b6ba 100644 --- a/qa/releases/luminous.yaml +++ b/qa/releases/luminous.yaml @@ -16,3 +16,5 @@ overrides: conf: mon: mon warn on osd down out interval zero: false + log-whitelist: + - no active mgr diff --git a/src/common/config_opts.h b/src/common/config_opts.h index aa6b16baee2c6..1fcf1c21f09ad 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1648,6 +1648,7 @@ OPTION(mgr_connect_retry_interval, OPT_DOUBLE, 1.0) OPTION(mon_mgr_digest_period, OPT_INT, 5) // How frequently to send digests OPTION(mon_mgr_beacon_grace, OPT_INT, 30) // How long to wait to failover +OPTION(mon_mgr_inactive_grace, OPT_INT, 60) // How long before health WARN -> ERR OPTION(rgw_list_bucket_min_readahead, OPT_INT, 1000) // minimum number of entries to read from rados for bucket listing diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index 39a80a466498d..fcef7ca50f814 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -19,6 +19,7 @@ #include "PGMonitor.h" #include "include/stringify.h" #include "mgr/MgrContext.h" +#include "OSDMonitor.h" #include "MgrMonitor.h" @@ -46,6 +47,12 @@ void MgrMonitor::update_from_paxos(bool *need_bootstrap) dout(4) << "active server: " << map.active_addr << "(" << map.active_gid << ")" << dendl; + if (map.available) { + 
first_seen_inactive = utime_t(); + } else { + first_seen_inactive = ceph_clock_now(); + } + check_subs(); } @@ -297,6 +304,36 @@ void MgrMonitor::send_digests() mon->timer.add_event_after(g_conf->mon_mgr_digest_period, digest_callback); } +void MgrMonitor::get_health( + list >& summary, + list > *detail, + CephContext *cct) const +{ + // start mgr warnings as soon as the mons and osds are all upgraded, + // but before the require_luminous osdmap flag is set. this way the + // user gets some warning before the osd flag is set and mgr is + // actually *required*. + if (!mon->monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_LUMINOUS) || + !HAVE_FEATURE(mon->osdmon()->osdmap.get_up_osd_features(), + SERVER_LUMINOUS)) { + return; + } + + if (!map.available) { + auto level = HEALTH_WARN; + // do not escalate to ERR if they are still upgrading to luminous. + if (mon->osdmon()->osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { + utime_t now = ceph_clock_now(); + if (first_seen_inactive != utime_t() && + now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) { + level = HEALTH_ERR; + } + } + summary.push_back(make_pair(level, "no active mgr")); + } +} + void MgrMonitor::tick() { if (!is_active() || !mon->is_leader()) diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h index 7e5fbce5438d2..b82b6df58a25a 100644 --- a/src/mon/MgrMonitor.h +++ b/src/mon/MgrMonitor.h @@ -22,6 +22,8 @@ class MgrMonitor : public PaxosService MgrMap map; MgrMap pending_map; + utime_t first_seen_inactive; + std::map last_beacon; /** @@ -70,6 +72,10 @@ public: void check_subs(); void send_digests(); + void get_health(list >& summary, + list > *detail, + CephContext *cct) const override; + void tick() override; void print_summary(Formatter *f, std::ostream *ss) const; -- 2.39.5