]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mon/HealthMonitor: refactor quorum_checks/leader_checks as PaxosMap
authorPatrick Donnelly <pdonnell@ibm.com>
Tue, 24 Jun 2025 16:21:55 +0000 (12:21 -0400)
committerPatrick Donnelly <pdonnell@ibm.com>
Wed, 1 Oct 2025 18:47:11 +0000 (14:47 -0400)
To codify protocol and catch bugs.

Signed-off-by: Patrick Donnelly <pdonnell@ibm.com>
src/mon/HealthMonitor.cc
src/mon/HealthMonitor.h

index 0c83b3c96f77380dd202e5c1009bd4cae7de9feb..9e058a2884727300fe706932c106b37809a4cd40 100644 (file)
@@ -72,7 +72,10 @@ static ostream& _prefix(std::ostream *_dout, const Monitor &mon,
 }
 
 HealthMonitor::HealthMonitor(Monitor &m, Paxos &p, const string& service_name)
-  : PaxosService(m, p, service_name) {
+  : PaxosService(m, p, service_name)
+  , quorum_checks(m, *this)
+  , leader_checks(m, *this)
+{
 }
 
 void HealthMonitor::init()
@@ -94,7 +97,7 @@ void HealthMonitor::update_from_paxos(bool *need_bootstrap)
   mon.store->get(service_name, "quorum", qbl);
   if (qbl.length()) {
     auto p = qbl.cbegin();
-    decode(quorum_checks, p);
+    quorum_checks.decode(p);
   } else {
     quorum_checks.clear();
   }
@@ -103,7 +106,7 @@ void HealthMonitor::update_from_paxos(bool *need_bootstrap)
   mon.store->get(service_name, "leader", lbl);
   if (lbl.length()) {
     auto p = lbl.cbegin();
-    decode(leader_checks, p);
+    leader_checks.decode(p);
   } else {
     leader_checks.clear();
   }
@@ -123,12 +126,12 @@ void HealthMonitor::update_from_paxos(bool *need_bootstrap)
   JSONFormatter jf(true);
   jf.open_object_section("health");
   jf.open_object_section("quorum_health");
-  for (auto& p : quorum_checks) {
-    string s = string("mon.") + stringify(p.first);
-    jf.dump_object(s.c_str(), p.second);
+  for (auto& [rank, checks] : quorum_checks.get_map()) {
+    string s = string("mon.") + stringify(rank);
+    jf.dump_object(s.c_str(), checks);
   }
   jf.close_section();
-  jf.dump_object("leader_health", leader_checks);
+  jf.dump_object("leader_health", leader_checks.get_map());
   jf.close_section();
   jf.flush(*_dout);
   *_dout << dendl;
@@ -147,10 +150,10 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   put_last_committed(t, version);
 
   bufferlist qbl;
-  encode(quorum_checks, qbl);
+  encode(quorum_checks.get_pending_map(), qbl);
   t->put(service_name, "quorum", qbl);
   bufferlist lbl;
-  encode(leader_checks, lbl);
+  encode(leader_checks.get_pending_map(), lbl);
   t->put(service_name, "leader", lbl);
   {
     bufferlist bl;
@@ -162,11 +165,11 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
 
   // combine per-mon details carefully...
   map<string,set<string>> names; // code -> <mon names>
-  for (auto p : quorum_checks) {
-    for (auto q : p.second.checks) {
-      names[q.first].insert(mon.monmap->get_name(p.first));
+  for (auto& [rank, check_map] : quorum_checks.get_pending_map()) {
+    for (auto q : check_map.checks) {
+      names[q.first].insert(mon.monmap->get_name(rank));
     }
-    pending_health.merge(p.second);
+    pending_health.merge(check_map);
   }
   for (auto &p : pending_health.checks) {
     p.second.summary = std::regex_replace(
@@ -186,7 +189,10 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
       names[p.first].size() > 1 ? "are" : "is");
   }
 
-  pending_health.merge(leader_checks);
+  /* populate leader_health health checks */
+  check_leader_health();
+
+  pending_health.merge(leader_checks.get_pending_map());
 }
 
 version_t HealthMonitor::get_trim_to() const
@@ -370,7 +376,9 @@ bool HealthMonitor::prepare_health_checks(MonOpRequestRef op)
 {
   auto m = op->get_req<MMonHealthChecks>();
   // no need to check if it's changed, the peon has done so
-  quorum_checks[m->get_source().num()] = std::move(m->health_checks);
+  auto rank = m->get_source().num();
+  auto& pending = quorum_checks.get_pending_map_writeable();
+  pending[rank] = std::move(m->health_checks);
   return true;
 }
 
@@ -684,20 +692,24 @@ bool HealthMonitor::check_member_health()
     d.detail.push_back(ds.str());
   }
 
-  auto p = quorum_checks.find(mon.rank);
-  if (p == quorum_checks.end()) {
-    if (next.empty()) {
-      return false;
-    }
-  } else {
-    if (p->second == next) {
-      return false;
+  {
+    auto& current_quorum_checks = quorum_checks.get_map();
+    auto p = current_quorum_checks.find(mon.rank);
+    if (p == current_quorum_checks.end()) {
+      if (next.empty()) {
+        return false;
+      }
+    } else {
+      if (p->second == next) {
+        return false;
+      }
     }
   }
 
   if (mon.is_leader()) {
     // prepare to propose
-    quorum_checks[mon.rank] = next;
+    auto& pending_quorum_checks = quorum_checks.get_pending_map_writeable();
+    pending_quorum_checks[mon.rank] = next;
     changed = true;
   } else {
     // tell the leader
@@ -715,10 +727,11 @@ bool HealthMonitor::check_leader_health()
   // prune quorum_health
   {
     auto& qset = mon.get_quorum();
-    auto p = quorum_checks.begin();
-    while (p != quorum_checks.end()) {
+    auto& pending_quorum_checks = quorum_checks.get_pending_map_writeable();
+    auto p = pending_quorum_checks.begin();
+    while (p != pending_quorum_checks.end()) {
       if (qset.count(p->first) == 0) {
-       p = quorum_checks.erase(p);
+       p = pending_quorum_checks.erase(p);
        changed = true;
       } else {
        ++p;
@@ -726,29 +739,29 @@ bool HealthMonitor::check_leader_health()
     }
   }
 
-  health_check_map_t next;
+  auto& pending = leader_checks.get_pending_map_writeable();
+  pending.clear(); /* start over */
 
  // DAEMON_OLD_VERSION
   if (g_conf().get_val<bool>("mon_warn_on_older_version")) {
-    check_for_older_version(&next);
+    check_for_older_version(&pending);
   }
   std::set<std::string> mons_down;
   // MON_DOWN
-  check_for_mon_down(&next, mons_down);
+  check_for_mon_down(&pending, mons_down);
   // MON_NETSPLIT
-  check_netsplit(&next, mons_down);
+  check_netsplit(&pending, mons_down);
   // MON_CLOCK_SKEW
-  check_for_clock_skew(&next);
+  check_for_clock_skew(&pending);
   // MON_MSGR2_NOT_ENABLED
   if (g_conf().get_val<bool>("mon_warn_on_msgr2_not_enabled")) {
-    check_if_msgr2_enabled(&next);
+    check_if_msgr2_enabled(&pending);
   }
   // STRETCH MODE
-  check_mon_crush_loc_stretch_mode(&next);
+  check_mon_crush_loc_stretch_mode(&pending);
 
-  if (next != leader_checks) {
+  if (pending != leader_checks.get_map()) {
     changed = true;
-    leader_checks = next;
   }
   return changed;
 }
index 276512323406ebf48e7af78238b2dc593523cd16..0150d9ec3f2676eaefb160b2e990d9a6930fb450 100644 (file)
 #include <string>
 
 #include "mon/PaxosService.h"
+#include "mon/PaxosMap.h"
+#include "mon/health_check.h"
 
 class HealthMonitor : public PaxosService
 {
   version_t version = 0;
-  std::map<int,health_check_map_t> quorum_checks;  // for each quorum member
-  health_check_map_t leader_checks;           // leader only
+  PaxosMap<Monitor, HealthMonitor, std::map<int,health_check_map_t> > quorum_checks;  // for each quorum member
+  PaxosMap<Monitor, HealthMonitor, health_check_map_t> leader_checks;           // leader only
   std::map<std::string,health_mute_t> mutes;
 
   std::map<std::string,health_mute_t> pending_mutes;