From b9cdb9fa7bef1bb4b93712293fddac3f1c52b26e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 20 Jun 2017 12:44:18 -0400 Subject: [PATCH] mon/MgrMonitor: do not issue MGR_DOWN on new cluster It is normal for the initial cluster to lack a mgr. Wait for some grace period before complaining about a missing mgr. Default to 60 seconds. Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/mon/MgrMonitor.cc | 44 ++++++++++++++++++++++++++++++++++------ src/mon/MgrMonitor.h | 3 +++ 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index a684e536f9e5d..c488fc981f7e3 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1755,6 +1755,7 @@ OPTION(mgr_service_beacon_grace, OPT_DOUBLE, 60.0) OPTION(mon_mgr_digest_period, OPT_INT, 5) // How frequently to send digests OPTION(mon_mgr_beacon_grace, OPT_INT, 30) // How long to wait to failover OPTION(mon_mgr_inactive_grace, OPT_INT, 60) // How long before health WARN -> ERR +OPTION(mon_mgr_mkfs_grace, OPT_INT, 60) // How long before we complain about MGR_DOWN OPTION(rgw_crypt_require_ssl, OPT_BOOL, true) // requests including encryption key headers must be sent over ssl OPTION(rgw_crypt_default_encryption_key, OPT_STR, "") // base64 encoded key for encryption of rgw objects OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR, "") // extra keys that may be used for aws:kms diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index b01028ef33ea4..8f5c252f2a23f 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -60,6 +60,8 @@ void MgrMonitor::update_from_paxos(bool *need_bootstrap) dout(4) << "active server: " << map.active_addr << "(" << map.active_gid << ")" << dendl; + ever_had_active_mgr = get_value("ever_had_active_mgr"); + load_health(); if (map.available) { @@ -81,6 +83,27 @@ void MgrMonitor::create_pending() pending_map.epoch++; } +health_status_t MgrMonitor::should_warn_about_mgr_down() +{ + utime_t now = 
ceph_clock_now(); + // we warn if + // - we've ever had an active mgr, or + // - we have osds AND we've exceeded the grace period + // which means a new mon cluster can be HEALTH_OK indefinitely as long as + // no OSDs are ever created. + if (ever_had_active_mgr || + (mon->osdmon()->osdmap.get_num_osds() > 0 && + now > mon->monmap->created + g_conf->mon_mgr_mkfs_grace)) { + health_status_t level = HEALTH_WARN; + if (first_seen_inactive != utime_t() && + now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) { + level = HEALTH_ERR; + } + return level; + } + return HEALTH_OK; +} + void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t) { dout(10) << __func__ << " " << pending_map << dendl; @@ -91,13 +114,15 @@ void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t) health_check_map_t next; if (!pending_map.available) { - health_status_t level = HEALTH_WARN; - utime_t now = ceph_clock_now(); - if (first_seen_inactive != utime_t() && - now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) { - level = HEALTH_ERR; + auto level = should_warn_about_mgr_down(); + if (level != HEALTH_OK) { + next.add("MGR_DOWN", level, "no active mgr"); + } else { + dout(10) << __func__ << " no health warning (never active and new cluster)" + << dendl; } - next.add("MGR_DOWN", level, "no active mgr"); + } else { + put_value(t, "ever_had_active_mgr", 1); } encode_health(next, t); } @@ -448,6 +473,13 @@ void MgrMonitor::tick() } } + if (!pending_map.available && + should_warn_about_mgr_down() != HEALTH_OK) { + dout(10) << " exceeded mon_mgr_mkfs_grace " << g_conf->mon_mgr_mkfs_grace + << " seconds" << dendl; + propose = true; + } + if (propose) { propose_pending(); } diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h index ea1a0a91a4aca..ca363ddeef6e8 100644 --- a/src/mon/MgrMonitor.h +++ b/src/mon/MgrMonitor.h @@ -22,6 +22,7 @@ class MgrMonitor: public PaxosService { MgrMap map; MgrMap pending_map; + bool ever_had_active_mgr = false; utime_t 
first_seen_inactive; @@ -42,6 +43,8 @@ class MgrMonitor: public PaxosService bool check_caps(MonOpRequestRef op, const uuid_d& fsid); + health_status_t should_warn_about_mgr_down(); + public: MgrMonitor(Monitor *mn, Paxos *p, const string& service_name) : PaxosService(mn, p, service_name) -- 2.39.5