From: Patrick Donnelly Date: Sun, 1 Jun 2025 00:53:55 +0000 (-0400) Subject: mon: refactor health check map through PaxosMap X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=83bf19ec8a3757fc6256636e116ed897b8e3447c;p=ceph-ci.git mon: refactor health check map through PaxosMap This was motivated by confusing persistence of some health warnings during testing of health warnings for cephx upgrades. Some services are only doing health checks during ::encode_pending and others during ::tick. Make it consistent. Signed-off-by: Patrick Donnelly --- diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc index c10a46ef2b6..5e245f50bce 100644 --- a/src/mon/AuthMonitor.cc +++ b/src/mon/AuthMonitor.cc @@ -324,7 +324,6 @@ void AuthMonitor::create_initial() void AuthMonitor::update_from_paxos(bool *need_bootstrap) { dout(10) << __func__ << dendl; - load_health(); version_t version = get_last_committed(); version_t keys_ver = mon.key_server.get_ver(); @@ -469,7 +468,7 @@ void AuthMonitor::encode_pending(MonitorDBStore::TransactionRef t) put_last_committed(t, version); // health - health_check_map_t next; + auto& next = get_health_checks_pending_writeable(); map> bad_detail; // entity -> details for (auto i = mon.key_server.secrets_begin(); i != mon.key_server.secrets_end(); @@ -513,7 +512,6 @@ void AuthMonitor::encode_pending(MonitorDBStore::TransactionRef t) } } } - encode_health(next, t); } void AuthMonitor::encode_full(MonitorDBStore::TransactionRef t) diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index 000aafb2511..b52ee9d64ea 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -93,7 +93,6 @@ void HealthMonitor::update_from_paxos(bool *need_bootstrap) { version = get_last_committed(); dout(10) << __func__ << dendl; - load_health(); bufferlist qbl; mon.store->get(service_name, "quorum", qbl); @@ -163,7 +162,7 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t) t->put(service_name, "mutes", bl); } - health_check_map_t pending_health; + auto& pending_health = get_health_checks_pending_writeable(); // combine per-mon details carefully... map> names; // code -> @@ -192,7 +191,6 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t) } pending_health.merge(leader_checks); - encode_health(pending_health, t); } version_t HealthMonitor::get_trim_to() const diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 272e2aada1d..8a171587d29 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -137,7 +137,6 @@ void MDSMonitor::update_from_paxos(bool *need_bootstrap) ceph_assert(version > get_fsmap().get_epoch()); load_metadata(pending_metadata); - load_health(); // read and decode bufferlist fsmap_bl; @@ -270,7 +269,7 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t) remove_from_metadata(pending, t); // health - health_check_map_t new_checks; + auto& new_checks = get_health_checks_pending_writeable(); const auto &info_map = pending.get_mds_info(); for (const auto &i : info_map) { const auto &gid = i.first; @@ -337,7 +336,6 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t) std::regex("%hasorhave%"), p.second.detail.size() > 1 ? "have" : "has"); } - encode_health(new_checks, t); } version_t MDSMonitor::get_trim_to() const diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index e387212ac09..22d224c17d3 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -186,7 +186,6 @@ void MgrMonitor::update_from_paxos(bool *need_bootstrap) ever_had_active_mgr = get_value("ever_had_active_mgr"); - load_health(); if (map.available) { first_seen_inactive = utime_t(); @@ -349,7 +348,7 @@ void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t) pending_metadata.clear(); pending_metadata_rm.clear(); - health_check_map_t next; + auto& next = get_health_checks_pending_writeable(); if (pending_map.active_gid == 0) { auto level = should_warn_about_mgr_down(); if (level != HEALTH_OK) { @@ -361,7 +360,6 @@ void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t) } else { put_value(t, "ever_had_active_mgr", 1); } - encode_health(next, t); if (pending_command_descs.size()) { dout(4) << __func__ << " encoding " << pending_command_descs.size() diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc index bf5b34677cb..ff1f32208b2 100644 --- a/src/mon/MgrStatMonitor.cc +++ b/src/mon/MgrStatMonitor.cc @@ -232,7 +232,6 @@ void MgrStatMonitor::update_from_paxos(bool *need_bootstrap) { version = get_last_committed(); dout(10) << " " << version << dendl; - load_health(); bufferlist bl; get_version(version, bl); if (version) { @@ -314,7 +313,6 @@ void MgrStatMonitor::create_pending() { dout(10) << " " << version << dendl; pending_digest = digest; - pending_health_checks = get_health_checks(); pending_service_map_bl.clear(); encode(service_map, pending_service_map_bl, mon.get_quorum_con_features()); } @@ -331,8 +329,6 @@ void MgrStatMonitor::encode_pending(MonitorDBStore::TransactionRef t) encode(pending_pool_availability, bl); put_version(t, version, bl); put_last_committed(t, version); - - encode_health(pending_health_checks, t); } version_t MgrStatMonitor::get_trim_to() const @@ -402,7 +398,7 @@ bool MgrStatMonitor::prepare_report(MonOpRequestRef op) bufferlist bl = m->get_data(); auto p = bl.cbegin(); decode(pending_digest, p); - pending_health_checks.swap(m->health_checks); + auto& pending_health_checks = get_health_checks_pending_writeable(); if (m->service_map_bl.length()) { pending_service_map_bl.swap(m->service_map_bl); } diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc index fa59c53273b..a9a92d69710 100644 --- a/src/mon/MonmapMonitor.cc +++ b/src/mon/MonmapMonitor.cc @@ -158,9 +158,8 @@ void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t) } //health - health_check_map_t next; + auto& next = get_health_checks_pending_writeable(); pending_map.check_health(&next); - encode_health(next, t); } class C_ApplyFeatures : public Context { diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc index f8e6f3c488b..02944596eff 100644 --- a/src/mon/NVMeofGwMon.cc +++ b/src/mon/NVMeofGwMon.cc @@ -239,9 +239,8 @@ void NVMeofGwMon::encode_pending(MonitorDBStore::TransactionRef t) put_last_committed(t, pending_map.epoch); //health - health_check_map_t checks; + auto& checks = get_health_checks_pending_writeable(); pending_map.get_health_checks(&checks); - encode_health(checks, t); } void NVMeofGwMon::update_from_paxos(bool *need_bootstrap) @@ -254,7 +253,6 @@ void NVMeofGwMon::update_from_paxos(bool *need_bootstrap) bufferlist bl; int err = get_version(version, bl); ceph_assert(err == 0); - load_health(); auto p = bl.cbegin(); map.decode(p); diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index c71a66c853a..1e6607038aa 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -744,7 +744,6 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) mapping_job.reset(); } - load_health(); /* * We will possibly have a stashed latest that *we* wrote, and we will @@ -2075,9 +2074,8 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t) } // health - health_check_map_t next; + auto& next = get_health_checks_pending_writeable(); tmp.check_health(cct, &next); - encode_health(next, t); } int OSDMonitor::load_metadata(int osd, map& m, ostream *err) diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc index 6e4da4cd4f0..55b08a0b66d 100644 --- a/src/mon/PaxosService.cc +++ b/src/mon/PaxosService.cc @@ -163,7 +163,7 @@ void PaxosService::refresh(bool *need_bootstrap) format_version = new_format; - update_from_paxos(need_bootstrap); + _update_from_paxos(need_bootstrap); } void PaxosService::post_refresh() @@ -223,7 +223,7 @@ void PaxosService::propose_pending() if (should_stash_full()) encode_full(t); - encode_pending(t); + _encode_pending(t); have_pending = false; if (format_version > 0) { @@ -340,7 +340,7 @@ void PaxosService::_active() if (mon.is_leader()) { dout(7) << __func__ << " creating new pending" << dendl; if (!have_pending) { - create_pending(); + _create_pending(); have_pending = true; } @@ -466,13 +466,34 @@ void PaxosService::trim(MonitorDBStore::TransactionRef t, } } -void PaxosService::load_health() +void PaxosService::_create_pending() { + dout(10) << __func__ << dendl; + health_checks.create_pending(); + dout(30) << __func__ << ": health_checks pending: " << health_checks << dendl; + create_pending(); +} + +void PaxosService::_encode_pending(MonitorDBStore::TransactionRef t) { + dout(10) << __func__ << dendl; + using ceph::encode; + encode_pending(t); + dout(30) << __func__ << ": health_checks encoding: " << health_checks << dendl; + auto const& pending = health_checks.get_pending_map(); + ceph::buffer::list bl; + encode(pending, bl); + t->put("health", service_name, bl); + mon.log_health(pending, health_checks.get_map(), t); +} + +void PaxosService::_update_from_paxos(bool* need_bootstrap) { + dout(10) << __func__ << dendl; bufferlist bl; mon.store->get("health", service_name, bl); if (bl.length()) { auto p = bl.cbegin(); - using ceph::decode; - decode(health_checks, p); + health_checks.decode(p); + dout(30) << __func__ << ": health_checks decoded: " << health_checks << dendl; } + update_from_paxos(need_bootstrap); } diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h index 53b28f6d357..ac69f15c458 100644 --- a/src/mon/PaxosService.h +++ b/src/mon/PaxosService.h @@ -20,6 +20,7 @@ #include "Paxos.h" #include "Monitor.h" #include "MonitorDBStore.h" +#include "PaxosMap.h" /** * A Paxos Service is an abstraction that easily allows one to obtain an @@ -49,7 +50,7 @@ class PaxosService { * If we are or have queued anything for proposal, this variable will be true * until our proposal has been finished. */ - bool proposing; + bool proposing = false; bool need_immediate_propose = false; @@ -61,35 +62,44 @@ protected: * must keep its own version, if so they wish. This variable should be used * for that purpose. */ - version_t service_version; + version_t service_version = 0; - private: +private: /** * Event callback responsible for proposing our pending value once a timer * runs out and fires. */ - Context *proposal_timer; + Context *proposal_timer = nullptr; /** * If the implementation class has anything pending to be proposed to Paxos, * then have_pending should be true; otherwise, false. */ - bool have_pending; + bool have_pending = false; + +protected: + /** + * format of our state in RocksDB, 0 for default + */ + version_t format_version = 0; /** * health checks for this service - * - * Child must populate this during encode_pending() by calling encode_health(). */ - health_check_map_t health_checks; -protected: + PaxosMap health_checks; + /** - * format of our state in RocksDB, 0 for default + * The pending health check map. Only callable by the service itself. */ - version_t format_version; + health_check_map_t& get_health_checks_pending_writeable() { + return health_checks.get_pending_map_writeable(); + } public: - const health_check_map_t& get_health_checks() const { - return health_checks; + /** + * The current (i.e. not pending) health check map. + */ + health_check_map_t const& get_health_checks() const { + return health_checks.get_map(); } /** @@ -146,14 +156,7 @@ public: * @param name Our service's name. */ PaxosService(Monitor &mn, Paxos &p, std::string name) - : mon(mn), paxos(p), service_name(name), - proposing(false), - service_version(0), proposal_timer(0), have_pending(false), - format_version(0), - last_committed_name("last_committed"), - first_committed_name("first_committed"), - full_prefix_name("full"), full_latest_name("latest"), - cached_first_committed(0), cached_last_committed(0) + : mon(mn), paxos(p), service_name(std::move(name)), health_checks(mn, *this) { } @@ -213,6 +216,12 @@ private: */ void _active(); + void _create_pending(); + + void _encode_pending(MonitorDBStore::TransactionRef t); + + void _update_from_paxos(bool* need_bootstrap); + public: /** * Propose a new value through Paxos. @@ -436,16 +445,6 @@ public: */ virtual void tick() {} - void encode_health(const health_check_map_t& next, - MonitorDBStore::TransactionRef t) { - using ceph::encode; - ceph::buffer::list bl; - encode(next, bl); - t->put("health", service_name, bl); - mon.log_health(next, health_checks, t); - } - void load_health(); - /** * @defgroup PaxosService_h_store_keys Set of keys that are usually used on * all the services implementing this @@ -454,10 +453,10 @@ public: * mistakes. * @{ */ - const std::string last_committed_name; - const std::string first_committed_name; - const std::string full_prefix_name; - const std::string full_latest_name; + const std::string last_committed_name{"last_committed"}; + const std::string first_committed_name{"first_committed"}; + const std::string full_prefix_name{"full"}; + const std::string full_latest_name{"latest"}; /** * @} */ @@ -473,8 +472,8 @@ public: * and avoid the overhead. * @{ */ - version_t cached_first_committed; - version_t cached_last_committed; + version_t cached_first_committed = 0; + version_t cached_last_committed = 0; /** * @} */ diff --git a/src/mon/health_check.h b/src/mon/health_check.h index 03877acc799..35f234e9824 100644 --- a/src/mon/health_check.h +++ b/src/mon/health_check.h @@ -3,12 +3,15 @@ #pragma once -#include +#include #include +#include #include "include/health.h" #include "include/utime.h" +#include "common/CanHasPrint.h" #include "common/Formatter.h" +#include "common/dout.h" struct health_check_t { health_status_t severity; @@ -39,6 +42,13 @@ struct health_check_t { return !(l == r); } + void print(std::ostream& os) const { + os << "hc" + << "(" << summary + << " count=" << count + << ")"; + } + void dump(ceph::Formatter *f, bool want_detail=true) const { f->dump_stream("severity") << severity; @@ -123,6 +133,17 @@ struct health_check_map_t { DENC_FINISH(p); } + static constexpr bool is_ephemeral() { + return true; + } + + void print(std::ostream& os) const { + os << "health_check_map_t" + << "(" + << checks + << ")"; + } + void dump(ceph::Formatter *f) const { for (auto& [code, check] : checks) { f->dump_object(code, check);