From 4a32f33ea38d7c9e6f34e7a5ea3b12281f8b5422 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 18 Mar 2024 11:03:23 -0400 Subject: [PATCH] mon: do not log MON_DOWN if monitor uptime is less than threshold Fixes: https://tracker.ceph.com/issues/64968 Signed-off-by: Patrick Donnelly (cherry picked from commit 4182362c1195f3f322ae50dc0054518e62fea73a) --- src/common/options/mon.yaml.in | 9 +++++++++ src/mon/HealthMonitor.cc | 9 +++++++-- src/mon/Monitor.cc | 1 + src/mon/Monitor.h | 12 ++++++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index ff8813c982f..379917445c0 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -55,6 +55,15 @@ options: default: 1_min services: - mon +- name: mon_down_uptime_grace + type: secs + level: advanced + desc: Period in seconds that the cluster may have a mon down after this (leader) monitor comes up. + default: 1_min + services: + - mon + flags: + - runtime - name: mon_mgr_beacon_grace type: secs level: advanced diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index 4d2303d09fb..45563f87d3d 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -803,9 +803,14 @@ void HealthMonitor::check_for_mon_down(health_check_map_t *checks) { int max = mon.monmap->size(); int actual = mon.get_quorum().size(); - const auto now = ceph::real_clock::now(); + const auto rcnow = ceph::real_clock::now(); + const auto created = mon.monmap->created.to_real_time(); + const auto mcnow = ceph::coarse_mono_clock::now(); + const auto starttime = mon.get_starttime(); + if (actual < max && - now > mon.monmap->created.to_real_time() + g_conf().get_val<std::chrono::seconds>("mon_down_mkfs_grace")) { + (rcnow - created) > g_conf().get_val<std::chrono::seconds>("mon_down_mkfs_grace") && + (mcnow - starttime) > g_conf().get_val<std::chrono::seconds>("mon_down_uptime_grace")) { ostringstream ss; ss << (max-actual) << "/" << max << " mons down, quorum " << 
mon.get_quorum_names(); diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 5543ff316ff..05748ac4ffb 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2659,6 +2659,7 @@ void Monitor::get_mon_status(Formatter *f) f->dump_int("rank", rank); f->dump_string("state", get_state_name()); f->dump_int("election_epoch", get_epoch()); + f->dump_int("uptime", get_uptime().count()); f->open_array_section("quorum"); for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p) { diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index ed1e2a4e8a9..13afacafde7 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -1099,6 +1099,18 @@ public: } bool is_keyring_required(); + +public: + ceph::coarse_mono_time get_starttime() const { + return starttime; + } + std::chrono::milliseconds get_uptime() const { + auto now = ceph::coarse_mono_clock::now(); + return std::chrono::duration_cast<std::chrono::milliseconds>(now-starttime); + } + +private: + ceph::coarse_mono_time const starttime = coarse_mono_clock::now(); }; #define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)") -- 2.39.5