From f99b8719ba8a20e88c6e32fd8b8fe4fabdb1fbc1 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 24 Jun 2025 12:21:55 -0400 Subject: [PATCH] mon/HealthMonitor: refactor quorum_checks/leader_checks as PaxosMap To codify protocol and catch bugs. Signed-off-by: Patrick Donnelly --- src/mon/HealthMonitor.cc | 85 +++++++++++++++++++++++----------------- src/mon/HealthMonitor.h | 6 ++- 2 files changed, 53 insertions(+), 38 deletions(-) diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index 0c83b3c96f7..9e058a28847 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -72,7 +72,10 @@ static ostream& _prefix(std::ostream *_dout, const Monitor &mon, } HealthMonitor::HealthMonitor(Monitor &m, Paxos &p, const string& service_name) - : PaxosService(m, p, service_name) { + : PaxosService(m, p, service_name) + , quorum_checks(m, *this) + , leader_checks(m, *this) +{ } void HealthMonitor::init() @@ -94,7 +97,7 @@ void HealthMonitor::update_from_paxos(bool *need_bootstrap) mon.store->get(service_name, "quorum", qbl); if (qbl.length()) { auto p = qbl.cbegin(); - decode(quorum_checks, p); + quorum_checks.decode(p); } else { quorum_checks.clear(); } @@ -103,7 +106,7 @@ void HealthMonitor::update_from_paxos(bool *need_bootstrap) mon.store->get(service_name, "leader", lbl); if (lbl.length()) { auto p = lbl.cbegin(); - decode(leader_checks, p); + leader_checks.decode(p); } else { leader_checks.clear(); } @@ -123,12 +126,12 @@ void HealthMonitor::update_from_paxos(bool *need_bootstrap) JSONFormatter jf(true); jf.open_object_section("health"); jf.open_object_section("quorum_health"); - for (auto& p : quorum_checks) { - string s = string("mon.") + stringify(p.first); - jf.dump_object(s.c_str(), p.second); + for (auto& [rank, checks] : quorum_checks.get_map()) { + string s = string("mon.") + stringify(rank); + jf.dump_object(s.c_str(), checks); } jf.close_section(); - jf.dump_object("leader_health", leader_checks); + jf.dump_object("leader_health", leader_checks.get_map()); jf.close_section(); jf.flush(*_dout); *_dout << dendl; @@ -147,10 +150,10 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t) put_last_committed(t, version); bufferlist qbl; - encode(quorum_checks, qbl); + encode(quorum_checks.get_pending_map(), qbl); t->put(service_name, "quorum", qbl); bufferlist lbl; - encode(leader_checks, lbl); + encode(leader_checks.get_pending_map(), lbl); t->put(service_name, "leader", lbl); { bufferlist bl; @@ -162,11 +165,11 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t) // combine per-mon details carefully... map> names; // code -> - for (auto p : quorum_checks) { - for (auto q : p.second.checks) { - names[q.first].insert(mon.monmap->get_name(p.first)); + for (auto& [rank, check_map] : quorum_checks.get_pending_map()) { + for (auto q : check_map.checks) { + names[q.first].insert(mon.monmap->get_name(rank)); } - pending_health.merge(p.second); + pending_health.merge(check_map); } for (auto &p : pending_health.checks) { p.second.summary = std::regex_replace( @@ -186,7 +189,10 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t) names[p.first].size() > 1 ? "are" : "is"); } - pending_health.merge(leader_checks); + /* populate leader_health health checks */ + check_leader_health(); + + pending_health.merge(leader_checks.get_pending_map()); } version_t HealthMonitor::get_trim_to() const @@ -370,7 +376,9 @@ bool HealthMonitor::prepare_health_checks(MonOpRequestRef op) { auto m = op->get_req(); // no need to check if it's changed, the peon has done so - quorum_checks[m->get_source().num()] = std::move(m->health_checks); + auto rank = m->get_source().num(); + auto& pending = quorum_checks.get_pending_map_writeable(); + pending[rank] = std::move(m->health_checks); return true; } @@ -684,20 +692,24 @@ bool HealthMonitor::check_member_health() d.detail.push_back(ds.str()); } - auto p = quorum_checks.find(mon.rank); - if (p == quorum_checks.end()) { - if (next.empty()) { - return false; - } - } else { - if (p->second == next) { - return false; + { + auto& current_quorum_checks = quorum_checks.get_map(); + auto p = current_quorum_checks.find(mon.rank); + if (p == current_quorum_checks.end()) { + if (next.empty()) { + return false; + } + } else { + if (p->second == next) { + return false; + } } } if (mon.is_leader()) { // prepare to propose - quorum_checks[mon.rank] = next; + auto& pending_quorum_checks = quorum_checks.get_pending_map_writeable(); + pending_quorum_checks[mon.rank] = next; changed = true; } else { // tell the leader @@ -715,10 +727,11 @@ bool HealthMonitor::check_leader_health() // prune quorum_health { auto& qset = mon.get_quorum(); - auto p = quorum_checks.begin(); - while (p != quorum_checks.end()) { + auto& pending_quorum_checks = quorum_checks.get_pending_map_writeable(); + auto p = pending_quorum_checks.begin(); + while (p != pending_quorum_checks.end()) { if (qset.count(p->first) == 0) { - p = quorum_checks.erase(p); + p = pending_quorum_checks.erase(p); changed = true; } else { ++p; @@ -726,29 +739,29 @@ bool HealthMonitor::check_leader_health() } } - health_check_map_t next; + auto& pending = leader_checks.get_pending_map_writeable(); + pending.clear(); /* start over */ // DAEMON_OLD_VERSION if (g_conf().get_val("mon_warn_on_older_version")) { - check_for_older_version(&next); + check_for_older_version(&pending); } std::set mons_down; // MON_DOWN - check_for_mon_down(&next, mons_down); + check_for_mon_down(&pending, mons_down); // MON_NETSPLIT - check_netsplit(&next, mons_down); + check_netsplit(&pending, mons_down); // MON_CLOCK_SKEW - check_for_clock_skew(&next); + check_for_clock_skew(&pending); // MON_MSGR2_NOT_ENABLED if (g_conf().get_val("mon_warn_on_msgr2_not_enabled")) { - check_if_msgr2_enabled(&next); + check_if_msgr2_enabled(&pending); } // STRETCH MODE - check_mon_crush_loc_stretch_mode(&next); + check_mon_crush_loc_stretch_mode(&pending); - if (next != leader_checks) { + if (pending != leader_checks.get_map()) { changed = true; - leader_checks = next; } return changed; } diff --git a/src/mon/HealthMonitor.h b/src/mon/HealthMonitor.h index 27651232340..0150d9ec3f2 100644 --- a/src/mon/HealthMonitor.h +++ b/src/mon/HealthMonitor.h @@ -19,12 +19,14 @@ #include #include "mon/PaxosService.h" +#include "mon/PaxosMap.h" +#include "mon/health_check.h" class HealthMonitor : public PaxosService { version_t version = 0; - std::map quorum_checks; // for each quorum member - health_check_map_t leader_checks; // leader only + PaxosMap > quorum_checks; // for each quorum member + PaxosMap leader_checks; // leader only std::map mutes; std::map pending_mutes; -- 2.39.5