From 96f15cee41c47eaf02e5da25d55a5b94df8a35b7 Mon Sep 17 00:00:00 2001 From: Kamoltat Sirivadhna Date: Thu, 15 Aug 2024 20:25:43 +0000 Subject: [PATCH] HealthMonitor: Add topology-aware netsplit detection and warning MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Problem: Currently, Ceph cannot detect and report network partitions (netsplits) between monitors in different topology locations in a consolidated way. While stretch mode can handle partitions through monitor elections, users lack visibility into the topology-level view of network disconnections, making troubleshooting difficult. Solution: This implementation adds a hierarchical netsplit detection mechanism that: - Uses DirectedGraph structure for netsplit detection - Maps monitor disconnections to relevant CRUSH topology levels - Aggregates individual disconnections into location-level reports when appropriate - Detects complete location-level netsplits when ALL monitors between locations cannot communicate - Reports specific topology locations experiencing complete communication failures - Falls back to individual monitor-level reporting for partial disconnections - Handles monitors with missing location data gracefully - Leverages HealthMonitor::check_for_mon_down to receive a set of down monitors, efficiently avoiding false netsplit reports for monitors already known to be down - Implements smart filtering that correctly excludes down monitors from location-based analysis, ensuring accurate netsplit reporting at both individual and topology levels The implementation produces user-friendly health warnings: 1. For complete location netsplits: "Netsplit detected between dc1 and dc2" 2. For individual monitor disconnections: "Netsplit detected between mon.a and mon.d" Performance considerations: - Time complexity: O(m²) where m is the number of monitors - Space complexity: O(m²) for connection tracking - Practical impact is minimal as monitor count is typically small (3-7) Fixes: https://tracker.ceph.com/issues/67371 Signed-off-by: Kamoltat Sirivadhna Conflicts: src/mon/Elector.cc - Trivial Fix --- src/mon/ConnectionTracker.cc | 164 ++++++++++++++++++++++ src/mon/ConnectionTracker.h | 21 +++ src/mon/Elector.cc | 5 + src/mon/Elector.h | 3 + src/mon/HealthMonitor.cc | 255 ++++++++++++++++++++++++++++++++++- src/mon/HealthMonitor.h | 3 +- 6 files changed, 447 insertions(+), 4 deletions(-) diff --git a/src/mon/ConnectionTracker.cc b/src/mon/ConnectionTracker.cc index 52bfd02af81..513d8230119 100644 --- a/src/mon/ConnectionTracker.cc +++ b/src/mon/ConnectionTracker.cc @@ -21,6 +21,10 @@ #undef dout_prefix #define dout_prefix _prefix(_dout, rank, epoch, version) +static std::ostream& _dgraph_prefix(std::ostream *_dout, CephContext *cct) { + return *_dout << "DirectedGraph "; +} + static std::ostream& _prefix(std::ostream *_dout, int rank, epoch_t epoch, uint64_t version) { return *_dout << "rank: " << rank << " version: "<< version << " ConnectionTracker(" << epoch << ") "; } @@ -270,6 +274,166 @@ void ConnectionTracker::notify_rank_removed(int rank_removed, int new_rank) increase_version(); } +#undef dout_prefix +#define dout_prefix _dgraph_prefix(_dout, cct) + +void DirectedGraph::add_outgoing_edge(unsigned from, unsigned to) +{ + if (outgoing_edges[from].find(to) == outgoing_edges[from].end()) { + outgoing_edges[from].insert(to); + } else { + ldout(cct, 30) << "Outgoing edge from " << from << " to " << to + << " already exists in the graph" << dendl; + } +} + +void DirectedGraph::add_incoming_edge(unsigned to, unsigned from) +{ + if (incoming_edges[to].find(from) == incoming_edges[to].end()) { + incoming_edges[to].insert(from); + } else { + ldout(cct, 30) << "Incoming edge to " << to << " from " << to + << " already exists in the graph" << dendl; + } +} + +bool DirectedGraph::has_outgoing_edge(unsigned from, unsigned to) const +{ + auto from_it = outgoing_edges.find(from); + if (from_it == outgoing_edges.end()) { + ldout(cct, 30) << "Node " << from + << " has no outgoing edges" << dendl; + return false; + } + return from_it->second.find(to) != from_it->second.end(); +} + +bool DirectedGraph::has_incoming_edge(unsigned to, unsigned from) const +{ + auto to_it = incoming_edges.find(to); + if (to_it == incoming_edges.end()) { + ldout(cct, 30) << "Node " << to + << " has no incoming edges" << dendl; + return false; + } + return to_it->second.find(from) != to_it->second.end(); +} + +#undef dout_prefix +#define dout_prefix _prefix(_dout, rank, epoch, version) + +std::set> ConnectionTracker::get_netsplit( + std::set &mons_down) +{ + ldout(cct, 30) << __func__ << dendl; + /* + * The netsplit detection algorithm is as follows: + * 1. Build a directed connectivity graph from peer reports and my reports, + * excluding down monitors. + * 2. Find missing connections (partitions). + * 3. Return the set of pairs of monitors that are in a netsplit. + * O(m^2) time complexity, where m is the number of monitors. + * O(m^2) space complexity. + */ + // Step 1: Build a directed connectivity graph + // from peer reports and my reports. Exclude down monitors. + // peer_reports: + // 1: {current={0:true,2:true},history={0:0.93,2:0.99},epoch=1,epoch_version=1}, + // 2: {current={0:true,1:true},history={0:0.93,1:0.85},epoch=1,epoch_version=1} + // O(m^2) time complexity, where m is the number of monitors + auto mons_down_end = mons_down.end(); + peer_reports[rank] = my_reports; + DirectedGraph bdg(cct); + for (const auto& [reporter_rank, report] : peer_reports) { + if (reporter_rank < 0) continue; + if (mons_down.find(reporter_rank) != mons_down_end) { + ldout(cct, 30) << "Skipping down monitor: " << reporter_rank << dendl; + continue; + } + for (const auto& [peer_rank, is_connected] : report.current) { + if (peer_rank < 0) continue; + if (mons_down.find(peer_rank) != mons_down_end) { + ldout(cct, 30) << "Skipping down monitor: " << peer_rank << dendl; + continue; + } + if (is_connected) { + bdg.add_outgoing_edge(reporter_rank, peer_rank); + bdg.add_incoming_edge(peer_rank, reporter_rank); + } + } + } + // For debugging purposes: + if (cct->_conf->subsys.should_gather(ceph_subsys_mon, 30)) { + ldout(cct, 30) << "Directed graph: " << dendl; + + ldout(cct, 30) << "Outgoing edges: {"; + bool outer_first = true; + for (const auto& [node, edges] : bdg.outgoing_edges) { + if (!outer_first) *_dout << ", "; + outer_first = false; + *_dout << node << " -> {"; + bool inner_first = true; + for (const auto& edge : edges) { + if (!inner_first) *_dout << ", "; + inner_first = false; + *_dout << edge; + } + *_dout << "}"; + } + *_dout << "}" << dendl; + + ldout(cct, 30) << "Incoming edges: {"; + bool outer_first = true; + for (const auto& [node, edges] : bdg.incoming_edges) { + if (!outer_first) *_dout << ", "; + outer_first = false; + *_dout << node << " <- {"; + bool inner_first = true; + for (const auto& edge : edges) { + if (!inner_first) *_dout << ", "; + inner_first = false; + *_dout << edge; + } + *_dout << "}"; + } + *_dout << "}" << dendl; + } + // Step 2: Find missing connections (partitions) + // Only consider it a partition if both node and peer doesn't + // have edges to each other AND have > 0 incoming edges. + // looping through incoming edges garantees that we are not + // considering a node without incoming edges as a partition. + // As for nodes that are not in quourm, they are already exlcuded + // in the previous step. + // O(m^2) time complexity, where m is the number of monitors + std::set> nsp_pairs; + for (const auto& [node, _] : bdg.incoming_edges) { + for (const auto& [peer, _] : bdg.incoming_edges) { + // Skip self-connections + if (node == peer) continue; + // Check for bidirectional communication failure + if (!bdg.has_outgoing_edge(node, peer) && + !bdg.has_outgoing_edge(peer, node) && + !bdg.has_incoming_edge(node, peer) && + !bdg.has_incoming_edge(peer, node)) { + // Normalize order to avoid duplicates + unsigned first = std::min(node, peer); + unsigned second = std::max(node, peer); + nsp_pairs.insert(std::make_pair(first, second)); + } + } + } + // For debugging purposes: + if (cct->_conf->subsys.should_gather(ceph_subsys_mon, 30)) { + ldout(cct, 30) << "Netsplit pairs: " << dendl; + for (const auto& nsp_pair : nsp_pairs) { + ldout(cct, 30) << "(" << nsp_pair.first << ", " + << nsp_pair.second << ") " << dendl; + } + } + return nsp_pairs; +} + bool ConnectionTracker::is_clean(int mon_rank, int monmap_size) { ldout(cct, 30) << __func__ << dendl; diff --git a/src/mon/ConnectionTracker.h b/src/mon/ConnectionTracker.h index c1a32c08fe0..31084d09e24 100644 --- a/src/mon/ConnectionTracker.h +++ b/src/mon/ConnectionTracker.h @@ -51,6 +51,21 @@ struct ConnectionReport { }; WRITE_CLASS_ENCODER(ConnectionReport); +struct DirectedGraph { + // The set of nodes in the graph + // works with only non-negative ranks + // because we only run the algorithm when + // all monitors have valid ranks. + std::map> outgoing_edges; + std::map> incoming_edges; + CephContext *cct; + DirectedGraph(CephContext *c) : cct(c) {} + void add_outgoing_edge(unsigned from, unsigned to); + void add_incoming_edge(unsigned to, unsigned from); + bool has_outgoing_edge(unsigned from, unsigned to) const; + bool has_incoming_edge(unsigned to, unsigned from) const; +}; + class RankProvider { public: /** @@ -127,6 +142,12 @@ class ConnectionTracker { * current and history of each peer_report. */ bool is_clean(int mon_rank, int monmap_size); + /** + * Get the set of monitor pairs that are disconnected + * due to network partitions. + * This is a set of pairs (rank1, rank2) where rank1 < rank2. + */ + std::set> get_netsplit(std::set &mons_down); /** * Encode this ConnectionTracker. Useful both for storing on disk * and for sending off to peers for decoding and import diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index 79cff4ca647..f40cf887def 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -771,6 +771,11 @@ bool Elector::peer_tracker_is_clean() return peer_tracker.is_clean(mon->rank, paxos_size()); } +std::set> Elector::get_netsplit_peer_tracker(std::set &mons_down) +{ + return peer_tracker.get_netsplit(mons_down); +} + bool Elector::is_tiebreaker(int rank) const { return mon->monmap->tiebreaker_mon == mon->monmap->get_name(rank); diff --git a/src/mon/Elector.h b/src/mon/Elector.h index e4a11b90190..ba6e46d6330 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -379,6 +379,9 @@ class Elector : public ElectionOwner, RankProvider { * https://tracker.ceph.com/issues/58049 */ bool peer_tracker_is_clean(); + + std::set> get_netsplit_peer_tracker(std::set &mons_down); + /** * Forget everything about our peers. :( */ diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index 2a78c46af82..117e4e050e5 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -733,8 +733,11 @@ bool HealthMonitor::check_leader_health() if (g_conf().get_val("mon_warn_on_older_version")) { check_for_older_version(&next); } + std::set mons_down; // MON_DOWN - check_for_mon_down(&next); + check_for_mon_down(&next, mons_down); + // MON_NETSPLIT + check_netsplit(&next, mons_down); // MON_CLOCK_SKEW check_for_clock_skew(&next); // MON_MSGR2_NOT_ENABLED @@ -802,7 +805,7 @@ void HealthMonitor::check_for_older_version(health_check_map_t *checks) } } -void HealthMonitor::check_for_mon_down(health_check_map_t *checks) +void HealthMonitor::check_for_mon_down(health_check_map_t *checks, std::set &mon_downs) { int max = mon.monmap->size(); int actual = mon.get_quorum().size(); @@ -822,7 +825,9 @@ void HealthMonitor::check_for_mon_down(health_check_map_t *checks) for (int i=0; iget_name(i) << " (rank " << i + std::string mon_name = mon.monmap->get_name(i); + mon_downs.insert(mon_name); + ss << "mon." << mon_name << " (rank " << i << ") addr " << mon.monmap->get_addrs(i) << " is down (out of quorum)"; d.detail.push_back(ss.str()); @@ -918,3 +923,247 @@ void HealthMonitor::check_mon_crush_loc_stretch_mode(health_check_map_t *checks) d.detail.swap(details); } } + +void HealthMonitor::check_netsplit(health_check_map_t *checks, std::set &mons_down) +{ + /** + * Check for netsplits between monitors and report them in a topology-aware manner + * + * This function detects network partitions between monitors and reports them as: + * - Location-level netsplits: When ALL monitors in one location cannot communicate + * with ALL monitors in another location, this is reported as a location-level + * netsplit (e.g., "Netsplit detected between dc1 and dc2") + * - Individual-level netsplits: When only specific monitors are disconnected + * and not following location boundaries, these are reported individually + * (e.g., "Netsplit detected between mon.a and mon.d") + * + * The function identifies the highest relevant topology level (zone, datacenter, etc.) + * when reporting location-level netsplits, to give operators the most useful information + * for troubleshooting network issues. + * + * Time Complexity: O(m^2) + * Space Complexity: O(m^2) + * where m is the number of monitors in the monmap. + */ + dout(20) << __func__ << dendl; + if (mon.monmap->size() < 3 || mon.monmap->strategy != MonMap::CONNECTIVITY) { + return; + } + std::set mons_down_ranks; + for (const auto& mon_name : mons_down) { + mons_down_ranks.insert(mon.monmap->get_rank(mon_name)); + } + // Get netsplit pairs early to avoid unnecessary work if no netsplits exist. + // O(m^2) + std::set> nsp_pairs = mon.elector.get_netsplit_peer_tracker(mons_down_ranks); + if (nsp_pairs.empty()) { + return; + } + // Pre-populate mon_loc_map & location_to_mons for each monitor, discarding monitors that are down, + // and sort the crush location highest to lowest by type id in the cluster topology. + // Sort takes O(1) since the number of locations in the cluster topology is fixed. + // OSDMap::_build_crush_types defines the hierarchy + // (root > region > datacenter > room > ...) which allows us to sort in + // descending order and report netsplits at the highest (most significant) + // level of the topology where monitors differ. + // Time Complexity: O(m) + // Space Complexity: O(m) + std::map> location_to_mons; + std::unordered_map>> mon_loc_map; + for (auto &mon_info : mon.monmap->mon_info) { + // Create a vector of pairs + std::vector> sorted_crush_loc_vec; + for (const auto& item : mon_info.second.crush_loc) { + sorted_crush_loc_vec.push_back(item); + } + // Sort the vector by type id + std::sort(sorted_crush_loc_vec.begin(), sorted_crush_loc_vec.end(), + [this](const std::pair &a, + const std::pair &b) { + auto a_type_id = mon.osdmon()->osdmap.crush->get_validated_type_id(a.first); + auto b_type_id = mon.osdmon()->osdmap.crush->get_validated_type_id(b.first); + // Handle missing type IDs gracefully + // If 'a' is invalid, it should come AFTER valid entries + if (!a_type_id.has_value()) { + dout(0) << "ERROR: Monitor CRUSH location type '" << a.first + << "' not found in CRUSH map" << dendl; + return false; + } + // If 'b' is invalid, it should come AFTER valid entries + if (!b_type_id.has_value()) { + dout(0) << "ERROR: Monitor CRUSH location type '" << b.first + << "' not found in CRUSH map" << dendl; + return true; + } + // Both have valid type IDs, sort by ID (higher IDs first) + return *a_type_id > *b_type_id; + }); + // Store in mon_loc_map + const std::string& mon_name = mon_info.second.name; + mon_loc_map[mon_name] = sorted_crush_loc_vec; + // Group monitors by location of their highest CRUSH bucket-type + // Discard monitors that are down or have no location + if (!sorted_crush_loc_vec.empty()) { + if (!mons_down.count(mon_name)) { + auto& highest_loc = sorted_crush_loc_vec.front(); + location_to_mons[highest_loc.second].insert(mon_name); + } else { + dout(30) << "mon: " << mon_name << " is down" << dendl; + } + } else { + dout(30) << "mon: " << mon_name << " has no location" << dendl; + } + } + + // retrieve the netsplit pairs and check for the highest common CRUSH + // bucket-type between the two monitors in the pair. + auto mon_loc_map_end = mon_loc_map.end(); + std::map, int> location_disconnects; + std::set> mon_disconnects; + for (auto &rank_pair : nsp_pairs) { + std::string first_mon = mon.monmap->get_name(rank_pair.first); + std::string second_mon = mon.monmap->get_name(rank_pair.second); + if (first_mon.empty()) { + dout(10) << "Failed to get mon name for rank " << rank_pair.first + << ", it might no longer exist in the monmap" << dendl; + continue; + } + if (second_mon.empty()) { + dout(10) << "Failed to get mon name for rank " << rank_pair.second + << ", it might no longer exist in the monmap" << dendl; + continue; + } + // Skip if either monitor is down ... although this should not happen + // if the connection scores that nsp_pairs is built from is correct. + if (mons_down.count(first_mon)) { + dout(10) << "mon: " << first_mon + << " is down; something is wrong with connection scores" << dendl; + continue; + } + if (mons_down.count(second_mon)) { + dout(10) << "mon: " << second_mon + << " is down; something is wrong with connection scores" << dendl; + continue; + } + // Skip if either monitor is not found in mon_loc_map + auto first_mon_loc_it = mon_loc_map.find(first_mon); + auto second_mon_loc_it = mon_loc_map.find(second_mon); + if (first_mon_loc_it == mon_loc_map_end) { + dout(10) << "Failed to locate mon: " << first_mon + << " might no longer exist in the monmap" << dendl; + continue; + } + if (second_mon_loc_it == mon_loc_map_end) { + dout(10) << "Failed to locate mon: " << second_mon + << " might no longer exist in the monmap" << dendl; + continue; + } + // If either monitor has no location, add to the individual-level netsplit report + if (first_mon_loc_it->second.empty() || second_mon_loc_it->second.empty()) { + if (first_mon > second_mon) std::swap(first_mon, second_mon); + mon_disconnects.insert({first_mon, second_mon}); + continue; + } + // Get the highest CRUSH bucket-type location for each monitor + std::string first_mon_highest_loc = first_mon_loc_it->second.front().second; + std::string second_mon_highest_loc = second_mon_loc_it->second.front().second; + // If the monitors are in the same location, add to the individual-level netsplit report + if (first_mon_highest_loc == second_mon_highest_loc) { + if (first_mon > second_mon) std::swap(first_mon, second_mon); + mon_disconnects.insert({first_mon, second_mon}); + continue; + } + // Else add to the location-level netsplit report + if (first_mon_highest_loc > second_mon_highest_loc) std::swap(first_mon_highest_loc, second_mon_highest_loc); + if (first_mon > second_mon) std::swap(first_mon, second_mon); + // Count the disconnects between the two monitors and locations + location_disconnects[{first_mon_highest_loc, second_mon_highest_loc}]++; + mon_disconnects.insert({first_mon, second_mon}); + } + + // For debugging purposes: + if (mon.cct->_conf->subsys.should_gather(ceph_subsys_mon, 30)) { + dout(30) << "mon_disconnects: {"; + bool first = true; + for (const auto& mon_pair : mon_disconnects) { + if (!first) *_dout << ", "; + *_dout << "(" << mon_pair.first << ", " + << mon_pair.second << ") "; + } + *_dout << "}" << dendl; + + dout(30) << "location_disconnects: {"; + bool first = true; + for (const auto& loc_pair : location_disconnects) { + if (!first) *_dout << ", "; + *_dout << "(" << loc_pair.first.first << ", " + << loc_pair.first.second << "): " + << loc_pair.second; + } + *_dout << "}" << dendl; + + dout(30) << "mon_loc_map: " << dendl; + for (const auto& mon_pair : mon_loc_map) { + dout(30) << mon_pair.first << ": {"; + bool first = true; + for (const auto& loc_pair : mon_pair.second) { + if (!first) *_dout << ", "; + first = false; + *_dout << loc_pair.first << ": " << loc_pair.second; + } + *_dout << "}" << dendl; + } + + dout(30) << "location_to_mons: " << dendl; + for (const auto& loc_pair : location_to_mons) { + dout(30) << loc_pair.first << ": {"; + bool first = true; + for (const auto& monitor : loc_pair.second) { + if (!first) *_dout << ", "; + first = false; + *_dout << monitor; + } + *_dout << "}" << dendl; + } + } + + // Check for location-level netsplits and remove individual-level netsplits + list details; + for (auto& kv : location_disconnects) { + auto& loc_pair = kv.first; + int disconnect_count = kv.second; + + // The expected number of disconnects between two locations + // is the product of the number of monitors in each location + int expected_disconnects = location_to_mons[loc_pair.first].size() * + location_to_mons[loc_pair.second].size(); + // Report location-level netsplits + if (disconnect_count == expected_disconnects) { + ostringstream ds; + ds << "Netsplit detected between " << loc_pair.first << " and " << loc_pair.second; + details.push_back(ds.str()); + + // Remove individual monitor disconnects between these locations + for (const auto& mon1 : location_to_mons[loc_pair.first]) { + for (const auto& mon2 : location_to_mons[loc_pair.second]) { + // Normalize the order to erase the correct pair (can't use std::swap) + mon_disconnects.erase({std::min(mon1, mon2), std::max(mon1, mon2)}); + } + } + } + } + // Report individual-level netsplits + for (auto& mon_pair : mon_disconnects) { + ostringstream ds; + ds << "Netsplit detected between mon." << mon_pair.first << " and mon." << mon_pair.second; + details.push_back(ds.str()); + } + + // Report health check if any details + if (!details.empty()) { + ostringstream ss; + ss << details.size() << " network partition" << (details.size() > 1 ? "s" : "") << " detected"; + auto& d = checks->add("MON_NETSPLIT", HEALTH_WARN, ss.str(), details.size()); + d.detail.swap(details); + } +} diff --git a/src/mon/HealthMonitor.h b/src/mon/HealthMonitor.h index 2182b6bbfce..b0315d6afbc 100644 --- a/src/mon/HealthMonitor.h +++ b/src/mon/HealthMonitor.h @@ -64,10 +64,11 @@ private: bool prepare_command(MonOpRequestRef op); bool prepare_health_checks(MonOpRequestRef op); void check_for_older_version(health_check_map_t *checks); - void check_for_mon_down(health_check_map_t *checks); + void check_for_mon_down(health_check_map_t *checks, std::set &mons_down); void check_for_clock_skew(health_check_map_t *checks); void check_mon_crush_loc_stretch_mode(health_check_map_t *checks); void check_if_msgr2_enabled(health_check_map_t *checks); + void check_netsplit(health_check_map_t *checks, std::set &mons_down); bool check_leader_health(); bool check_member_health(); bool check_mutes(); -- 2.39.5