git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon: do not log MON_DOWN if monitor uptime is less than threshold
author: Patrick Donnelly <pdonnell@redhat.com>
Mon, 18 Mar 2024 15:03:23 +0000 (11:03 -0400)
committer: Patrick Donnelly <pdonnell@redhat.com>
Fri, 22 Mar 2024 16:02:37 +0000 (12:02 -0400)
Fixes: https://tracker.ceph.com/issues/64968
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
(cherry picked from commit 4182362c1195f3f322ae50dc0054518e62fea73a)

src/common/options/mon.yaml.in
src/mon/HealthMonitor.cc
src/mon/Monitor.cc
src/mon/Monitor.h

index ff8813c982f940c38c51597639d1a5ee7fa0e558..379917445c0f07969ae218dd68611c8710d4d3d6 100644 (file)
@@ -55,6 +55,15 @@ options:
   default: 1_min
   services:
   - mon
+- name: mon_down_uptime_grace
+  type: secs
+  level: advanced
+  desc: Period in seconds that the cluster may have a mon down after this (leader) monitor comes up.
+  default: 1_min
+  services:
+  - mon
+  flags:
+  - runtime
 - name: mon_mgr_beacon_grace
   type: secs
   level: advanced
index 4d2303d09fbcf1da510b5e930ac2e071d70a105c..45563f87d3d0403e9a04458fefe05fb09c3a0fe4 100644 (file)
@@ -803,9 +803,14 @@ void HealthMonitor::check_for_mon_down(health_check_map_t *checks)
 {
   int max = mon.monmap->size();
   int actual = mon.get_quorum().size();
-  const auto now = ceph::real_clock::now();
+  const auto rcnow = ceph::real_clock::now();
+  const auto created = mon.monmap->created.to_real_time();
+  const auto mcnow = ceph::coarse_mono_clock::now();
+  const auto starttime = mon.get_starttime();
+
   if (actual < max &&
-      now > mon.monmap->created.to_real_time() + g_conf().get_val<std::chrono::seconds>("mon_down_mkfs_grace")) {
+      (rcnow - created) > g_conf().get_val<std::chrono::seconds>("mon_down_mkfs_grace") &&
+      (mcnow - starttime) > g_conf().get_val<std::chrono::seconds>("mon_down_uptime_grace")) {
     ostringstream ss;
     ss << (max-actual) << "/" << max << " mons down, quorum "
        << mon.get_quorum_names();
index 5543ff316fff27de5bf5207d57b8d18013ee863f..05748ac4ffbc39b8f4f85fabf721a31b38dd68f6 100644 (file)
@@ -2659,6 +2659,7 @@ void Monitor::get_mon_status(Formatter *f)
   f->dump_int("rank", rank);
   f->dump_string("state", get_state_name());
   f->dump_int("election_epoch", get_epoch());
+  f->dump_int("uptime", get_uptime().count());
 
   f->open_array_section("quorum");
   for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p) {
index ed1e2a4e8a95dbb135a8ede858245dc547a70ff3..13afacafde7dd6b01ce4450499eb3a121d48eaf3 100644 (file)
@@ -1099,6 +1099,18 @@ public:
   }
 
   bool is_keyring_required();
+
+public:
+  ceph::coarse_mono_time get_starttime() const {
+    return starttime;
+  }
+  std::chrono::milliseconds get_uptime() const {
+    auto now = ceph::coarse_mono_clock::now();
+    return std::chrono::duration_cast<std::chrono::milliseconds>(now-starttime);
+  }
+
+private:
+  ceph::coarse_mono_time const starttime = coarse_mono_clock::now();
 };
 
 #define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)")