From bcee73003b86011a9f60361befcdf2b4f66dc127 Mon Sep 17 00:00:00 2001 From: Kamoltat Sirivadhna Date: Sun, 13 Jul 2025 20:26:49 +0000 Subject: [PATCH] src/mon/HealthMonitor: Add mon_netsplit_grace_period to suppress transient MON_NETSPLIT warnings MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When a monitor is elected leader and begins evaluating connectivity, it may detect temporary disconnections between monitors that have not yet fully reconnected to each other—particularly after events like monitor restarts, SIGSTOP/SIGCONT (as used in mon_thrash), or brief network blips. This can result in false-positive MON_NETSPLIT health warnings that quickly disappear within seconds as the cluster topology stabilizes. This commit introduces a configurable option: - mon_netsplit_grace_period (default: 9 seconds) When the leader observes a netsplit between two monitors or locations, it will wait for the grace period before raising a health warning. If the split resolves within this window, no warning is emitted. This reduces test flakiness and alert fatigue while preserving the accuracy of persistent MON_NETSPLIT detection. Fixes: https://tracker.ceph.com/issues/71344 Signed-off-by: Kamoltat Sirivadhna --- src/common/options/mon.yaml.in | 9 +- src/mon/HealthMonitor.cc | 186 +++++++++++++++++++++++++-------- src/mon/HealthMonitor.h | 5 +- 3 files changed, 155 insertions(+), 45 deletions(-) diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index 2c337fcea7072..f34ae3f4fe0e7 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -1412,4 +1412,11 @@ options: - mon flags: - runtime - \ No newline at end of file +- name: mon_netsplit_grace_period + type: secs + level: advanced + desc: Time (in seconds) to wait before emitting a MON_NETSPLIT + health warning. + default: 9 + services : + - mon diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index 117e4e050e504..007b2e7a6f231 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -957,6 +957,10 @@ void HealthMonitor::check_netsplit(health_check_map_t *checks, std::set> nsp_pairs = mon.elector.get_netsplit_peer_tracker(mons_down_ranks); if (nsp_pairs.empty()) { + pending_mon_netsplits.clear(); + pending_location_netsplits.clear(); + dout(30) << "No netsplit pairs found, clearing" + << " pending_mon_netsplits and pending_location_netsplits" << dendl; return; } // Pre-populate mon_loc_map & location_to_mons for each monitor, discarding monitors that are down, @@ -1080,15 +1084,112 @@ void HealthMonitor::check_netsplit(health_check_map_t *checks, std::set> detected_location_netsplits; + std::set> detected_mon_netsplits; + // Check for location-level netsplits and remove individual-level netsplits + for (auto& kv : location_disconnects) { + auto& loc_pair = kv.first; // {dc1,dc2} + int disconnect_count = kv.second; // Number of disconnects between dc1 and dc2 + + // The expected number of disconnects between two locations + // is the product of the number of monitors in each location + int expected_disconnects = location_to_mons[loc_pair.first].size() * + location_to_mons[loc_pair.second].size(); + + // Report location-level netsplits + if (disconnect_count == expected_disconnects) { + detected_location_netsplits.insert(loc_pair); + // Remove individual monitor disconnects between these locations + for (const auto& mon1 : location_to_mons[loc_pair.first]) { + for (const auto& mon2 : location_to_mons[loc_pair.second]) { + // Normalize the order to erase the correct pair (can't use std::swap) + mon_disconnects.erase({std::min(mon1, mon2), std::max(mon1, mon2)}); + } + } + } + + } + // Report individual-level netsplits + for (auto& mon_pair : mon_disconnects) { + detected_mon_netsplits.insert(mon_pair); + } - // For debugging purposes: + // update/add/erase to pending_mon_netsplits and pending_location_netsplits + auto now = ceph::coarse_mono_clock::now(); + auto mon_netsplit_grace_period = g_conf().get_val("mon_netsplit_grace_period"); + auto pending_location_netsplits_end = pending_location_netsplits.end(); + auto pending_mon_netsplits_end = pending_mon_netsplits.end(); + list details; + + // Process location-level netsplits + for (const auto& nsp : detected_location_netsplits) { + auto loc_it = pending_location_netsplits.find(nsp); + if (loc_it != pending_location_netsplits_end) { + auto elapsed = now - loc_it->second; + if (elapsed >= mon_netsplit_grace_period) { + // Add MON_NETSPLIT detail and erase the netsplit + // from pending_location_netsplits + dout(20) << "Netsplit detected between " << loc_it->first.first + << " and " << loc_it->first.second + << ", elapsed time: " << elapsed + << " > mon_netsplit_grace_period: " << mon_netsplit_grace_period << dendl; + ostringstream ds; + ds << "Netsplit detected between " << loc_it->first.first + << " and " << loc_it->first.second; + details.push_back(ds.str()); + pending_location_netsplits.erase(loc_it); + } + } else { + // First time seeing the location-level netsplit + dout(20) << "First time seeing netsplit between " << nsp.first + << " and " << nsp.second << dendl; + // Add to pending_location_netsplits + pending_location_netsplits[nsp] = now; + } + } + + // Process monitor-level netsplits + for (const auto& mon_pair : detected_mon_netsplits) { + auto mon_it = pending_mon_netsplits.find(mon_pair); + if (mon_it != pending_mon_netsplits_end) { + auto elapsed = now - mon_it->second; + if (elapsed >= mon_netsplit_grace_period) { + // Add MON_NETSPLIT detail and erase the netsplit + // from pending_mon_netsplits + dout(20) << "Netsplit detected between mon." << mon_it->first.first + << " and mon." << mon_it->first.second + << ", elapsed time: " << elapsed + << " > mon_netsplit_grace_period: " << mon_netsplit_grace_period << dendl; + ostringstream ds; + ds << "Netsplit detected between mon." << mon_it->first.first + << " and mon." << mon_it->first.second; + details.push_back(ds.str()); + pending_mon_netsplits.erase(mon_it); + } + } else { + // First time seeing the monitor-level netsplit + dout(20) << "First time seeing netsplit between mon." << mon_pair.first + << " and mon." << mon_pair.second << dendl; + pending_mon_netsplits[mon_pair] = now; + } + } + + // Report health check if any details + if (!details.empty()) { + ostringstream ss; + ss << details.size() << " network partition" << (details.size() > 1 ? "s" : "") << " detected"; + auto& d = checks->add("MON_NETSPLIT", HEALTH_WARN, ss.str(), details.size()); + d.detail.swap(details); + } + if (mon.cct->_conf->subsys.should_gather(ceph_subsys_mon, 30)) { dout(30) << "mon_disconnects: {"; bool first = true; for (const auto& mon_pair : mon_disconnects) { if (!first) *_dout << ", "; - *_dout << "(" << mon_pair.first << ", " - << mon_pair.second << ") "; + first = false; + *_dout << "(" << mon_pair.first << ", " << mon_pair.second << ")"; } *_dout << "}" << dendl; @@ -1096,9 +1197,9 @@ void HealthMonitor::check_netsplit(health_check_map_t *checks, std::set details; - for (auto& kv : location_disconnects) { - auto& loc_pair = kv.first; - int disconnect_count = kv.second; - - // The expected number of disconnects between two locations - // is the product of the number of monitors in each location - int expected_disconnects = location_to_mons[loc_pair.first].size() * - location_to_mons[loc_pair.second].size(); - // Report location-level netsplits - if (disconnect_count == expected_disconnects) { - ostringstream ds; - ds << "Netsplit detected between " << loc_pair.first << " and " << loc_pair.second; - details.push_back(ds.str()); - - // Remove individual monitor disconnects between these locations - for (const auto& mon1 : location_to_mons[loc_pair.first]) { - for (const auto& mon2 : location_to_mons[loc_pair.second]) { - // Normalize the order to erase the correct pair (can't use std::swap) - mon_disconnects.erase({std::min(mon1, mon2), std::max(mon1, mon2)}); - } - } + dout(30) << "detected_location_netsplits: {"; + bool first = true; + for (const auto& netsplit : detected_location_netsplits) { + if (!first) *_dout << ", "; + *_dout << "(" << netsplit.first << ", " << netsplit.second << ")"; + first = false; } - } - // Report individual-level netsplits - for (auto& mon_pair : mon_disconnects) { - ostringstream ds; - ds << "Netsplit detected between mon." << mon_pair.first << " and mon." << mon_pair.second; - details.push_back(ds.str()); - } + *_dout << "}" << dendl; + + dout(30) << "detected_mon_netsplits: {"; + bool first = true; + for (const auto& netsplit : detected_mon_netsplits) { + if (!first) *_dout << ", "; + *_dout << "(" << netsplit.first << ", " << netsplit.second << ")"; + first = false; + } + *_dout << "}" << dendl; - // Report health check if any details - if (!details.empty()) { - ostringstream ss; - ss << details.size() << " network partition" << (details.size() > 1 ? "s" : "") << " detected"; - auto& d = checks->add("MON_NETSPLIT", HEALTH_WARN, ss.str(), details.size()); - d.detail.swap(details); + dout(30) << "pending_location_netsplits: {"; + bool first = true; + for (const auto& netsplit : pending_location_netsplits) { + if (!first) *_dout << ", "; + *_dout << "(" << netsplit.first.first << ", " << netsplit.first.second + << "): " << netsplit.second; + first = false; + } + *_dout << "}" << dendl; + + dout(30) << "pending_mon_netsplits: {"; + bool first = true; + for (const auto& netsplit : pending_mon_netsplits) { + if (!first) *_dout << ", "; + *_dout << "(" << netsplit.first.first << ", " << netsplit.first.second + << "): " << netsplit.second; + first = false; + } + *_dout << "}" << dendl; } } diff --git a/src/mon/HealthMonitor.h b/src/mon/HealthMonitor.h index b0315d6afbca1..75a6ef3ae7e90 100644 --- a/src/mon/HealthMonitor.h +++ b/src/mon/HealthMonitor.h @@ -22,7 +22,10 @@ class HealthMonitor : public PaxosService std::map quorum_checks; // for each quorum member health_check_map_t leader_checks; // leader only std::map mutes; - + // location level netsplit pairs to elasped time + std::map, ceph::coarse_mono_clock::time_point> pending_location_netsplits; + // individual level netsplit pairs to elasped time + std::map, ceph::coarse_mono_clock::time_point> pending_mon_netsplits; std::map pending_mutes; public: -- 2.39.5