From b028a41e1f000b87aab3f263ab3259a0ca439555 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Wed, 19 Nov 2025 18:16:21 -0500 Subject: [PATCH] mon/HealthMonitor: avoid MON_DOWN for freshly added Monitor In testing, we often have the scenario where cephadm has created a cluster but doesn't add more monitors until well past mon_down_mkfs_grace. This causes useless MON_DOWN warnings to be thrown which fails QA jobs. Avoid this situation entirely by giving a reasonable grace period for a monitor added to the MonMap to join quorum. Fixes: https://tracker.ceph.com/issues/73934 Signed-off-by: Patrick Donnelly --- src/common/options/mon.yaml.in | 9 ++++++++ src/mon/HealthMonitor.cc | 38 ++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index 4f9d9b012ff..c52ca49a412 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -62,6 +62,15 @@ options: - mon flags: - runtime +- name: mon_down_added_grace + type: secs + level: advanced + desc: Period in seconds that the cluster may have a newly added mon down + default: 3_min + services: + - mon + flags: + - runtime - name: mon_down_uptime_grace type: secs level: advanced diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index b5fe1ee9485..66faec05c88 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -811,30 +811,38 @@ void HealthMonitor::check_for_mon_down(health_check_map_t *checks, std::setsize(); int actual = mon.get_quorum().size(); + const auto mon_down_mkfs_grace = g_conf().get_val("mon_down_mkfs_grace"); + const auto mon_down_uptime_grace = g_conf().get_val("mon_down_uptime_grace"); + const auto mon_down_added_grace = g_conf().get_val("mon_down_added_grace"); + const auto rcnow = ceph::real_clock::now(); const auto created = mon.monmap->created.to_real_time(); const auto mcnow = ceph::coarse_mono_clock::now(); const auto starttime = mon.get_starttime(); - if (actual < max && - (rcnow - created) > g_conf().get_val("mon_down_mkfs_grace") && - (mcnow - starttime) > g_conf().get_val("mon_down_uptime_grace")) { - ostringstream ss; - ss << (max-actual) << "/" << max << " mons down, quorum " - << mon.get_quorum_names(); - auto& d = checks->add("MON_DOWN", HEALTH_WARN, ss.str(), max - actual); - set q = mon.get_quorum(); + if (actual < max && ((rcnow - created) > mon_down_mkfs_grace) && ((mcnow - starttime) > mon_down_uptime_grace)) { + auto q = mon.get_quorum(); + std::list details; for (int i=0; iget_name(i); - mon_downs.insert(mon_name); - ss << "mon." << mon_name << " (rank " << i - << ") addr " << mon.monmap->get_addrs(i) - << " is down (out of quorum)"; - d.detail.push_back(ss.str()); + ostringstream ss; + std::string mon_name = mon.monmap->get_name(i); + auto const& info = mon.monmap->get(mon_name); + if ((rcnow - info.time_added) > mon_down_added_grace) { + mon_downs.insert(mon_name); + ss << "mon." << mon_name << " (rank " << i + << ") addr " << mon.monmap->get_addrs(i) + << " is down (out of quorum)"; + details.push_back(ss.str()); + } } } + if (details.size()) { + ostringstream ss; + ss << (max-actual) << "/" << max << " mons down, quorum " << mon.get_quorum_names(); + auto& d = checks->add("MON_DOWN", HEALTH_WARN, ss.str(), max - actual); + d.detail = std::move(details); + } } } -- 2.47.3