]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mon: refactor health check map through PaxosMap
authorPatrick Donnelly <pdonnell@ibm.com>
Sun, 1 Jun 2025 00:53:55 +0000 (20:53 -0400)
committerPatrick Donnelly <pdonnell@ibm.com>
Mon, 29 Dec 2025 22:30:14 +0000 (17:30 -0500)
This was motivated by confusing persistence of some health warnings during
testing of health warnings for cephx upgrades. Some services are only doing
health checks during ::encode_pending and others during ::tick. Make it
consistent.

Signed-off-by: Patrick Donnelly <pdonnell@ibm.com>
src/mon/AuthMonitor.cc
src/mon/HealthMonitor.cc
src/mon/MDSMonitor.cc
src/mon/MgrMonitor.cc
src/mon/MgrStatMonitor.cc
src/mon/MonmapMonitor.cc
src/mon/NVMeofGwMon.cc
src/mon/OSDMonitor.cc
src/mon/PaxosService.cc
src/mon/PaxosService.h
src/mon/health_check.h

index c10a46ef2b64434df0ab806d55190afb778aaab8..5e245f50bcedad3758ba9839fcb20cbfbe6ca6ef 100644 (file)
@@ -324,7 +324,6 @@ void AuthMonitor::create_initial()
 void AuthMonitor::update_from_paxos(bool *need_bootstrap)
 {
   dout(10) << __func__ << dendl;
-  load_health();
 
   version_t version = get_last_committed();
   version_t keys_ver = mon.key_server.get_ver();
@@ -469,7 +468,7 @@ void AuthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   put_last_committed(t, version);
 
   // health
-  health_check_map_t next;
+  auto& next = get_health_checks_pending_writeable();
   map<string,list<string>> bad_detail;  // entity -> details
   for (auto i = mon.key_server.secrets_begin();
        i != mon.key_server.secrets_end();
@@ -513,7 +512,6 @@ void AuthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
       }
     }
   }
-  encode_health(next, t);
 }
 
 void AuthMonitor::encode_full(MonitorDBStore::TransactionRef t)
index 000aafb25115651bc3792d9fac559e552a9fe8eb..b52ee9d64ea44f034fb22270255b9a4d99ed4b40 100644 (file)
@@ -93,7 +93,6 @@ void HealthMonitor::update_from_paxos(bool *need_bootstrap)
 {
   version = get_last_committed();
   dout(10) << __func__ << dendl;
-  load_health();
 
   bufferlist qbl;
   mon.store->get(service_name, "quorum", qbl);
@@ -163,7 +162,7 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
     t->put(service_name, "mutes", bl);
   }
 
-  health_check_map_t pending_health;
+  auto& pending_health = get_health_checks_pending_writeable();
 
   // combine per-mon details carefully...
   map<string,set<string>> names; // code -> <mon names>
@@ -192,7 +191,6 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   }
 
   pending_health.merge(leader_checks);
-  encode_health(pending_health, t);
 }
 
 version_t HealthMonitor::get_trim_to() const
index 272e2aada1dae085e8db82765c2f5adb604f4c79..8a171587d29d5296ab4dd90a1dae58ad54498ad3 100644 (file)
@@ -137,7 +137,6 @@ void MDSMonitor::update_from_paxos(bool *need_bootstrap)
   ceph_assert(version > get_fsmap().get_epoch());
 
   load_metadata(pending_metadata);
-  load_health();
 
   // read and decode
   bufferlist fsmap_bl;
@@ -270,7 +269,7 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   remove_from_metadata(pending, t);
 
   // health
-  health_check_map_t new_checks;
+  auto& new_checks = get_health_checks_pending_writeable();
   const auto &info_map = pending.get_mds_info();
   for (const auto &i : info_map) {
     const auto &gid = i.first;
@@ -337,7 +336,6 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
       std::regex("%hasorhave%"),
       p.second.detail.size() > 1 ? "have" : "has");
   }
-  encode_health(new_checks, t);
 }
 
 version_t MDSMonitor::get_trim_to() const
index e387212ac0917decae6ba562fff8abd6ae4850cf..22d224c17d31b1327e25d3971572b9def736b5ff 100644 (file)
@@ -186,7 +186,6 @@ void MgrMonitor::update_from_paxos(bool *need_bootstrap)
 
     ever_had_active_mgr = get_value("ever_had_active_mgr");
 
-    load_health();
 
     if (map.available) {
       first_seen_inactive = utime_t();
@@ -349,7 +348,7 @@ void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   pending_metadata.clear();
   pending_metadata_rm.clear();
 
-  health_check_map_t next;
+  auto& next = get_health_checks_pending_writeable();
   if (pending_map.active_gid == 0) {
     auto level = should_warn_about_mgr_down();
     if (level != HEALTH_OK) {
@@ -361,7 +360,6 @@ void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   } else {
     put_value(t, "ever_had_active_mgr", 1);
   }
-  encode_health(next, t);
 
   if (pending_command_descs.size()) {
     dout(4) << __func__ << " encoding " << pending_command_descs.size()
index bf5b34677cbff3a7f6acf165f249193039df493f..ff1f32208b21385433b8f99a56fadbb3e100630e 100644 (file)
@@ -232,7 +232,6 @@ void MgrStatMonitor::update_from_paxos(bool *need_bootstrap)
 {
   version = get_last_committed();
   dout(10) << " " << version << dendl;
-  load_health();
   bufferlist bl;
   get_version(version, bl);
   if (version) {
@@ -314,7 +313,6 @@ void MgrStatMonitor::create_pending()
 {
   dout(10) << " " << version << dendl;
   pending_digest = digest;
-  pending_health_checks = get_health_checks();
   pending_service_map_bl.clear();
   encode(service_map, pending_service_map_bl, mon.get_quorum_con_features());
 }
@@ -331,8 +329,6 @@ void MgrStatMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   encode(pending_pool_availability, bl);
   put_version(t, version, bl);
   put_last_committed(t, version);
-
-  encode_health(pending_health_checks, t);
 }
 
 version_t MgrStatMonitor::get_trim_to() const
@@ -402,7 +398,7 @@ bool MgrStatMonitor::prepare_report(MonOpRequestRef op)
   bufferlist bl = m->get_data();
   auto p = bl.cbegin();
   decode(pending_digest, p);
-  pending_health_checks.swap(m->health_checks);
+  auto& pending_health_checks = get_health_checks_pending_writeable();
   if (m->service_map_bl.length()) {
     pending_service_map_bl.swap(m->service_map_bl);
   }
index fa59c53273ba03e8e058080a044dfcb4957316eb..a9a92d69710a8eb42a5e74cce1c14baa13b3ca6d 100644 (file)
@@ -158,9 +158,8 @@ void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   }
 
   //health
-  health_check_map_t next;
+  auto& next = get_health_checks_pending_writeable();
   pending_map.check_health(&next);
-  encode_health(next, t);
 }
 
 class C_ApplyFeatures : public Context {
index f8e6f3c488b8fd18ac264ba11b1be3208bd0058c..02944596eff656d4d959bf89c36aa31f2864240b 100644 (file)
@@ -239,9 +239,8 @@ void NVMeofGwMon::encode_pending(MonitorDBStore::TransactionRef t)
   put_last_committed(t, pending_map.epoch);
 
   //health
-  health_check_map_t checks;
+  auto& checks = get_health_checks_pending_writeable();
   pending_map.get_health_checks(&checks);
-  encode_health(checks, t);
 }
 
 void NVMeofGwMon::update_from_paxos(bool *need_bootstrap)
@@ -254,7 +253,6 @@ void NVMeofGwMon::update_from_paxos(bool *need_bootstrap)
     bufferlist bl;
     int err = get_version(version, bl);
     ceph_assert(err == 0);
-    load_health();
 
     auto p = bl.cbegin();
     map.decode(p);
index c71a66c853abda66b2564ec8ae97e7f587b45e7a..1e6607038aac33e8cf3e845476c4f13e56053fd5 100644 (file)
@@ -744,7 +744,6 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
     mapping_job.reset();
   }
 
-  load_health();
 
   /*
    * We will possibly have a stashed latest that *we* wrote, and we will
@@ -2075,9 +2074,8 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   }
 
   // health
-  health_check_map_t next;
+  auto& next = get_health_checks_pending_writeable();
   tmp.check_health(cct, &next);
-  encode_health(next, t);
 }
 
 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
index 6e4da4cd4f0f6674bd6314df95b42ddc488ec1d9..55b08a0b66d47354cac4d039f014d7420d82a695 100644 (file)
@@ -163,7 +163,7 @@ void PaxosService::refresh(bool *need_bootstrap)
   format_version = new_format;
 
 
-  update_from_paxos(need_bootstrap);
+  _update_from_paxos(need_bootstrap);
 }
 
 void PaxosService::post_refresh()
@@ -223,7 +223,7 @@ void PaxosService::propose_pending()
   if (should_stash_full())
     encode_full(t);
 
-  encode_pending(t);
+  _encode_pending(t);
   have_pending = false;
 
   if (format_version > 0) {
@@ -340,7 +340,7 @@ void PaxosService::_active()
   if (mon.is_leader()) {
     dout(7) << __func__ << " creating new pending" << dendl;
     if (!have_pending) {
-      create_pending();
+      _create_pending();
       have_pending = true;
     }
 
@@ -466,13 +466,34 @@ void PaxosService::trim(MonitorDBStore::TransactionRef t,
   }
 }
 
-void PaxosService::load_health()
+void PaxosService::_create_pending() {
+  dout(10) << __func__ << dendl;
+  health_checks.create_pending();
+  dout(30) << __func__ << ": health_checks pending: " << health_checks << dendl;
+  create_pending();
+}
+
+void PaxosService::_encode_pending(MonitorDBStore::TransactionRef t) {
+  dout(10) << __func__ << dendl;
+  using ceph::encode;
+  encode_pending(t);
+  dout(30) << __func__ << ": health_checks encoding: " << health_checks << dendl;
+  auto const& pending = health_checks.get_pending_map();
+  ceph::buffer::list bl;
+  encode(pending, bl);
+  t->put("health", service_name, bl);
+  mon.log_health(pending, health_checks.get_map(), t);
+}
+
+void PaxosService::_update_from_paxos(bool* need_bootstrap)
 {
+  dout(10) << __func__ << dendl;
   bufferlist bl;
   mon.store->get("health", service_name, bl);
   if (bl.length()) {
     auto p = bl.cbegin();
-    using ceph::decode;
-    decode(health_checks, p);
+    health_checks.decode(p);
+    dout(30) << __func__ << ":  health_checks decoded: " << health_checks << dendl;
   }
+  update_from_paxos(need_bootstrap);
 }
index 53b28f6d357d5c9a59de8dbc47d6b8eaa2155e69..ac69f15c458f9462df1d72317d71b27b6e95b5b0 100644 (file)
@@ -20,6 +20,7 @@
 #include "Paxos.h"
 #include "Monitor.h"
 #include "MonitorDBStore.h"
+#include "PaxosMap.h"
 
 /**
  * A Paxos Service is an abstraction that easily allows one to obtain an
@@ -49,7 +50,7 @@ class PaxosService {
    * If we are or have queued anything for proposal, this variable will be true
    * until our proposal has been finished.
    */
-  bool proposing;
+  bool proposing = false;
 
   bool need_immediate_propose = false;
 
@@ -61,35 +62,44 @@ protected:
    * must keep its own version, if so they wish. This variable should be used
    * for that purpose.
    */
-  version_t service_version;
+  version_t service_version = 0;
 
- private:
+private:
   /**
    * Event callback responsible for proposing our pending value once a timer 
    * runs out and fires.
    */
-  Context *proposal_timer;
+  Context *proposal_timer = nullptr;
   /**
    * If the implementation class has anything pending to be proposed to Paxos,
    * then have_pending should be true; otherwise, false.
    */
-  bool have_pending; 
+  bool have_pending = false;
+
+protected:
+  /**
+   * format of our state in RocksDB, 0 for default
+   */
+  version_t format_version = 0;
 
   /**
    * health checks for this service
-   *
-   * Child must populate this during encode_pending() by calling encode_health().
    */
-  health_check_map_t health_checks;
-protected:
+  PaxosMap<Monitor, PaxosService, health_check_map_t> health_checks;
+
   /**
-   * format of our state in RocksDB, 0 for default
+   * The pending health check map. Only callable by the service itself.
    */
-  version_t format_version;
+  health_check_map_t& get_health_checks_pending_writeable() {
+    return health_checks.get_pending_map_writeable();
+  }
 
 public:
-  const health_check_map_t& get_health_checks() const {
-    return health_checks;
+  /**
+   * The current (i.e. not pending) health check map.
+   */
+  health_check_map_t const& get_health_checks() const {
+    return health_checks.get_map();
   }
 
   /**
@@ -146,14 +156,7 @@ public:
    * @param name Our service's name.
    */
   PaxosService(Monitor &mn, Paxos &p, std::string name) 
-    : mon(mn), paxos(p), service_name(name),
-      proposing(false),
-      service_version(0), proposal_timer(0), have_pending(false),
-      format_version(0),
-      last_committed_name("last_committed"),
-      first_committed_name("first_committed"),
-      full_prefix_name("full"), full_latest_name("latest"),
-      cached_first_committed(0), cached_last_committed(0)
+    : mon(mn), paxos(p), service_name(std::move(name)), health_checks(mn, *this)
   {
   }
 
@@ -213,6 +216,12 @@ private:
    */
   void _active();
 
+  void _create_pending();
+
+  void _encode_pending(MonitorDBStore::TransactionRef t);
+
+  void _update_from_paxos(bool* need_bootstrap);
+
 public:
   /**
    * Propose a new value through Paxos.
@@ -436,16 +445,6 @@ public:
    */
   virtual void tick() {}
 
-  void encode_health(const health_check_map_t& next,
-                    MonitorDBStore::TransactionRef t) {
-    using ceph::encode;
-    ceph::buffer::list bl;
-    encode(next, bl);
-    t->put("health", service_name, bl);
-    mon.log_health(next, health_checks, t);
-  }
-  void load_health();
-
   /**
    * @defgroup PaxosService_h_store_keys Set of keys that are usually used on
    *                                    all the services implementing this
@@ -454,10 +453,10 @@ public:
    *                                    mistakes.
    * @{
    */
-  const std::string last_committed_name;
-  const std::string first_committed_name;
-  const std::string full_prefix_name;
-  const std::string full_latest_name;
+  const std::string last_committed_name{"last_committed"};
+  const std::string first_committed_name{"first_committed"};
+  const std::string full_prefix_name{"full"};
+  const std::string full_latest_name{"latest"};
   /**
    * @}
    */
@@ -473,8 +472,8 @@ public:
    *                                        and avoid the overhead.
    * @{
    */
-  version_t cached_first_committed;
-  version_t cached_last_committed;
+  version_t cached_first_committed = 0;
+  version_t cached_last_committed = 0;
   /**
    * @}
    */
index 03877acc799c0215d607e880d6d2415fc2adbe5c..35f234e98244bd4f3e7090979d0731e1ce4b194a 100644 (file)
@@ -3,12 +3,15 @@
 
 #pragma once
 
-#include <string>
+#include <iosfwd>
 #include <map>
+#include <string>
 
 #include "include/health.h"
 #include "include/utime.h"
+#include "common/CanHasPrint.h"
 #include "common/Formatter.h"
+#include "common/dout.h"
 
 struct health_check_t {
   health_status_t severity;
@@ -39,6 +42,13 @@ struct health_check_t {
     return !(l == r);
   }
 
+  void print(std::ostream& os) const {
+    os << "hc"
+       << "(" << summary
+       << " count=" << count
+       << ")";
+  }
+
   void dump(ceph::Formatter *f, bool want_detail=true) const {
     f->dump_stream("severity") << severity;
 
@@ -123,6 +133,17 @@ struct health_check_map_t {
     DENC_FINISH(p);
   }
 
+  static constexpr bool is_ephemeral() {
+    return true;
+  }
+
+  void print(std::ostream& os) const {
+    os << "health_check_map_t"
+       << "("
+       << checks
+       << ")";
+  }
+
   void dump(ceph::Formatter *f) const {
     for (auto& [code, check] : checks) {
       f->dump_object(code, check);