From b297babca37ab73c46cf26f5ede80fec9e9dd04c Mon Sep 17 00:00:00 2001 From: Kamoltat Sirivadhna Date: Sun, 13 Jul 2025 20:26:49 +0000 Subject: [PATCH] src/mon/HealthMonitor: Add mon_netsplit_grace_period to suppress transient MON_NETSPLIT warnings MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When a monitor is elected leader and begins evaluating connectivity, it may detect temporary disconnections between monitors that have not yet fully reconnected to each other—particularly after events like monitor restarts, SIGSTOP/SIGCONT (as used in mon_thrash), or brief network blips. This can result in false-positive MON_NETSPLIT health warnings that quickly disappear within seconds as the cluster topology stabilizes. This commit introduces a configurable option: - mon_netsplit_grace_period (default: 9 seconds) When the leader observes a netsplit between two monitors or locations, it will wait for the grace period before raising a health warning. If the split resolves within this window, no warning is emitted. This reduces test flakiness and alert fatigue while preserving the accuracy of persistent MON_NETSPLIT detection. Fixes: https://tracker.ceph.com/issues/71344 Signed-off-by: Kamoltat Sirivadhna Conflicts: src/common/options/mon.yaml.in - trivial fix --- src/common/options/mon.yaml.in | 7 + src/mon/HealthMonitor.cc | 227 ++++++++++++++++++++++++++------- src/mon/HealthMonitor.h | 9 +- 3 files changed, 197 insertions(+), 46 deletions(-) diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index 15922b4dbdb..af9e7e842c0 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -1419,6 +1419,13 @@ options: should trigger a panic if it does not receive the initial map from the monitor default: 30 services: +- name: mon_netsplit_grace_period + type: secs + level: advanced + desc: Time (in seconds) to wait before emitting a MON_NETSPLIT + health warning. + default: 9 + services : - mon - name: pool_availability_update_interval type: float diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index ec98ff795ec..de8da7b524e 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -942,14 +942,26 @@ void HealthMonitor::check_netsplit(health_check_map_t *checks, std::setsize() < 3 || mon.monmap->strategy != MonMap::CONNECTIVITY) { + if (mon.monmap->size() < 3) { + dout(10) << "Insufficient monitors for netsplit detection" << dendl; + return; + } + + if (mon.monmap->strategy != MonMap::CONNECTIVITY) { + dout(10) << "Monitor strategy is not CONNECTIVITY, skipping netsplit check" << dendl; return; } + std::set mons_down_ranks; for (const auto& mon_name : mons_down) { mons_down_ranks.insert(mon.monmap->get_rank(mon_name)); @@ -958,6 +970,13 @@ void HealthMonitor::check_netsplit(health_check_map_t *checks, std::set> nsp_pairs = mon.elector.get_netsplit_peer_tracker(mons_down_ranks); if (nsp_pairs.empty()) { + pending_mon_netsplits.clear(); + pending_location_netsplits.clear(); + current_mon_netsplits.clear(); + current_location_netsplits.clear(); + dout(30) << "No netsplit pairs found, clearing" + << " pending_mon_netsplits, pending_location_netsplits" + << " current_mon_netsplits, current_location_netsplits" << dendl; return; } // Pre-populate mon_loc_map & location_to_mons for each monitor, discarding monitors that are down, @@ -1081,15 +1100,134 @@ void HealthMonitor::check_netsplit(health_check_map_t *checks, std::set> detected_location_netsplits; + std::set> detected_mon_netsplits; + // Check for location-level netsplits and remove individual-level netsplits + for (auto& kv : location_disconnects) { + auto& loc_pair = kv.first; // {dc1,dc2} + int disconnect_count = kv.second; // Number of disconnects between dc1 and dc2 + + // The expected number of disconnects between two locations + // is the product of the number of monitors in each location + int expected_disconnects = location_to_mons[loc_pair.first].size() * + location_to_mons[loc_pair.second].size(); + + // Report location-level netsplits + if (disconnect_count == expected_disconnects) { + detected_location_netsplits.insert(loc_pair); + // Remove individual monitor disconnects between these locations + for (const auto& mon1 : location_to_mons[loc_pair.first]) { + for (const auto& mon2 : location_to_mons[loc_pair.second]) { + // Normalize the order to erase the correct pair (can't use std::swap) + mon_disconnects.erase({std::min(mon1, mon2), std::max(mon1, mon2)}); + } + } + } + + } + // Report individual-level netsplits + for (auto& mon_pair : mon_disconnects) { + detected_mon_netsplits.insert(mon_pair); + } - // For debugging purposes: + // update/add/erase to pending_mon_netsplits and pending_location_netsplits + auto now = ceph::coarse_mono_clock::now(); + auto mon_netsplit_grace_period = g_conf().get_val("mon_netsplit_grace_period"); + auto pending_location_netsplits_end = pending_location_netsplits.end(); + auto pending_mon_netsplits_end = pending_mon_netsplits.end(); + list details; + + // Process location-level netsplits + for (const auto& nsp : detected_location_netsplits) { + auto loc_it = pending_location_netsplits.find(nsp); + if (loc_it != pending_location_netsplits_end) { + auto elapsed = now - loc_it->second; + if (elapsed >= mon_netsplit_grace_period) { + // Add MON_NETSPLIT detail, erase the netsplit + // from pending_location_netsplits and move to current_location_netsplits + dout(20) << "Netsplit detected between " << loc_it->first.first + << " and " << loc_it->first.second + << ", elapsed time: " << elapsed + << " > mon_netsplit_grace_period: " << mon_netsplit_grace_period << dendl; + ostringstream ds; + ds << "Netsplit detected between " << loc_it->first.first + << " and " << loc_it->first.second; + details.push_back(ds.str()); + pending_location_netsplits.erase(loc_it); + current_location_netsplits[nsp] = now; + } + } else if (current_location_netsplits.count(nsp)) { + // Can't find in pending_location_netsplits, but found in current_location_netsplits + // This means the netsplit is still ongoing, so we continue reporting it + dout(20) << "Ongoing netsplit between " << nsp.first + << " and " << nsp.second << " duration: " + << (now - current_location_netsplits[nsp]) << dendl; + ostringstream ds; + ds << "Netsplit detected between " << nsp.first + << " and " << nsp.second; + details.push_back(ds.str()); + } else { + // First time seeing the location-level netsplit + dout(20) << "First time seeing netsplit between " << nsp.first + << " and " << nsp.second << dendl; + // Add to pending_location_netsplits + pending_location_netsplits[nsp] = now; + } + } + + // Process monitor-level netsplits + for (const auto& mon_pair : detected_mon_netsplits) { + auto mon_it = pending_mon_netsplits.find(mon_pair); + if (mon_it != pending_mon_netsplits_end) { + auto elapsed = now - mon_it->second; + if (elapsed >= mon_netsplit_grace_period) { + // Add MON_NETSPLIT detail, erase the netsplit + // from pending_mon_netsplits and move to current_mon_netsplits + dout(20) << "Netsplit detected between mon." << mon_it->first.first + << " and mon." << mon_it->first.second + << ", elapsed time: " << elapsed + << " > mon_netsplit_grace_period: " << mon_netsplit_grace_period << dendl; + ostringstream ds; + ds << "Netsplit detected between mon." << mon_it->first.first + << " and mon." << mon_it->first.second; + details.push_back(ds.str()); + pending_mon_netsplits.erase(mon_it); + current_mon_netsplits[mon_pair] = now; + } + } else if (current_mon_netsplits.count(mon_pair)) { + // Can't find in pending_mon_netsplits, but found in current_mon_netsplits + // This means the netsplit is still ongoing, so we continue reporting it + dout(20) << "Ongoing netsplit between mon." << mon_pair.first + << " and mon." << mon_pair.second + << " duration: " << (now - current_mon_netsplits[mon_pair]) << dendl; + ostringstream ds; + ds << "Netsplit detected between mon." << mon_pair.first + << " and mon." << mon_pair.second; + details.push_back(ds.str()); + } else { + // First time seeing the monitor-level netsplit + dout(20) << "First time seeing netsplit between mon." << mon_pair.first + << " and mon." << mon_pair.second << dendl; + pending_mon_netsplits[mon_pair] = now; + } + } + + // Report health check if any details + if (!details.empty()) { + ostringstream ss; + ss << details.size() << " network partition" << (details.size() > 1 ? "s" : "") << " detected"; + auto& d = checks->add("MON_NETSPLIT", HEALTH_WARN, ss.str(), details.size()); + d.detail.swap(details); + } + if (mon.cct->_conf->subsys.should_gather(ceph_subsys_mon, 30)) { dout(30) << "mon_disconnects: {"; bool first = true; for (const auto& mon_pair : mon_disconnects) { if (!first) *_dout << ", "; - *_dout << "(" << mon_pair.first << ", " - << mon_pair.second << ") "; + first = false; + *_dout << "(" << mon_pair.first << ", " << mon_pair.second << ")"; } *_dout << "}" << dendl; @@ -1097,9 +1235,9 @@ void HealthMonitor::check_netsplit(health_check_map_t *checks, std::set details; - for (auto& kv : location_disconnects) { - auto& loc_pair = kv.first; - int disconnect_count = kv.second; - - // The expected number of disconnects between two locations - // is the product of the number of monitors in each location - int expected_disconnects = location_to_mons[loc_pair.first].size() * - location_to_mons[loc_pair.second].size(); - // Report location-level netsplits - if (disconnect_count == expected_disconnects) { - ostringstream ds; - ds << "Netsplit detected between " << loc_pair.first << " and " << loc_pair.second; - details.push_back(ds.str()); - - // Remove individual monitor disconnects between these locations - for (const auto& mon1 : location_to_mons[loc_pair.first]) { - for (const auto& mon2 : location_to_mons[loc_pair.second]) { - // Normalize the order to erase the correct pair (can't use std::swap) - mon_disconnects.erase({std::min(mon1, mon2), std::max(mon1, mon2)}); - } - } + dout(30) << "detected_location_netsplits: {"; + bool first = true; + for (const auto& netsplit : detected_location_netsplits) { + if (!first) *_dout << ", "; + *_dout << "(" << netsplit.first << ", " << netsplit.second << ")"; + first = false; } - } - // Report individual-level netsplits - for (auto& mon_pair : mon_disconnects) { - ostringstream ds; - ds << "Netsplit detected between mon." << mon_pair.first << " and mon." << mon_pair.second; - details.push_back(ds.str()); - } + *_dout << "}" << dendl; + + dout(30) << "detected_mon_netsplits: {"; + bool first = true; + for (const auto& netsplit : detected_mon_netsplits) { + if (!first) *_dout << ", "; + *_dout << "(" << netsplit.first << ", " << netsplit.second << ")"; + first = false; + } + *_dout << "}" << dendl; - // Report health check if any details - if (!details.empty()) { - ostringstream ss; - ss << details.size() << " network partition" << (details.size() > 1 ? "s" : "") << " detected"; - auto& d = checks->add("MON_NETSPLIT", HEALTH_WARN, ss.str(), details.size()); - d.detail.swap(details); + dout(30) << "pending_location_netsplits: {"; + bool first = true; + for (const auto& netsplit : pending_location_netsplits) { + if (!first) *_dout << ", "; + *_dout << "(" << netsplit.first.first << ", " << netsplit.first.second + << "): " << netsplit.second; + first = false; + } + *_dout << "}" << dendl; + + dout(30) << "pending_mon_netsplits: {"; + bool first = true; + for (const auto& netsplit : pending_mon_netsplits) { + if (!first) *_dout << ", "; + *_dout << "(" << netsplit.first.first << ", " << netsplit.first.second + << "): " << netsplit.second; + first = false; + } + *_dout << "}" << dendl; } -} +} \ No newline at end of file diff --git a/src/mon/HealthMonitor.h b/src/mon/HealthMonitor.h index 27651232340..1030bf58220 100644 --- a/src/mon/HealthMonitor.h +++ b/src/mon/HealthMonitor.h @@ -26,7 +26,14 @@ class HealthMonitor : public PaxosService std::map quorum_checks; // for each quorum member health_check_map_t leader_checks; // leader only std::map mutes; - + // location level netsplit pairs to elasped time + std::map, ceph::coarse_mono_clock::time_point> pending_location_netsplits; + // individual level netsplit pairs to elasped time + std::map, ceph::coarse_mono_clock::time_point> pending_mon_netsplits; + // currently active location netsplits with their elapsed time + std::map, ceph::coarse_mono_clock::time_point> current_location_netsplits; + // currently active monitor netsplits with their elapsed time + std::map, ceph::coarse_mono_clock::time_point> current_mon_netsplits; std::map pending_mutes; public: -- 2.39.5