From 496e6df754d9b2d80575c0169964d418cf11a0fe Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Sat, 31 May 2025 20:53:55 -0400 Subject: [PATCH] mon: refactor health check map through PaxosMap This was motivated by confusing persistence of some health warnings during testing of health warnings for cephx upgrades. Some services are only doing health checks during ::encode_pending and others during ::tick. Make it consistent. Signed-off-by: Patrick Donnelly (cherry picked from commit 5e0c2b7aa74e378dadbaccb3584f750485b17e44) --- src/mon/AuthMonitor.cc | 4 +-- src/mon/HealthMonitor.cc | 4 +-- src/mon/MDSMonitor.cc | 4 +-- src/mon/MgrMonitor.cc | 4 +-- src/mon/MgrStatMonitor.cc | 6 +--- src/mon/MonmapMonitor.cc | 3 +- src/mon/NVMeofGwMon.cc | 4 +-- src/mon/OSDMonitor.cc | 4 +-- src/mon/PaxosService.cc | 33 ++++++++++++++---- src/mon/PaxosService.h | 73 +++++++++++++++++++-------------------- src/mon/health_check.h | 23 +++++++++++- 11 files changed, 93 insertions(+), 69 deletions(-) diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc index d335e36ebbe..8b98de684ed 100644 --- a/src/mon/AuthMonitor.cc +++ b/src/mon/AuthMonitor.cc @@ -323,7 +323,6 @@ void AuthMonitor::create_initial() void AuthMonitor::update_from_paxos(bool *need_bootstrap) { dout(10) << __func__ << dendl; - load_health(); version_t version = get_last_committed(); version_t keys_ver = mon.key_server.get_ver(); @@ -468,7 +467,7 @@ void AuthMonitor::encode_pending(MonitorDBStore::TransactionRef t) put_last_committed(t, version); // health - health_check_map_t next; + auto& next = get_health_checks_pending_writeable(); map> bad_detail; // entity -> details for (auto i = mon.key_server.secrets_begin(); i != mon.key_server.secrets_end(); @@ -512,7 +511,6 @@ void AuthMonitor::encode_pending(MonitorDBStore::TransactionRef t) } } } - encode_health(next, t); } void AuthMonitor::encode_full(MonitorDBStore::TransactionRef t) diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index 117e4e050e5..2c7e1ad6001 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -88,7 +88,6 @@ void HealthMonitor::update_from_paxos(bool *need_bootstrap) { version = get_last_committed(); dout(10) << __func__ << dendl; - load_health(); bufferlist qbl; mon.store->get(service_name, "quorum", qbl); @@ -158,7 +157,7 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t) t->put(service_name, "mutes", bl); } - health_check_map_t pending_health; + auto& pending_health = get_health_checks_pending_writeable(); // combine per-mon details carefully... map> names; // code -> @@ -187,7 +186,6 @@ void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t) } pending_health.merge(leader_checks); - encode_health(pending_health, t); } version_t HealthMonitor::get_trim_to() const diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index c03bd3f7be9..80079bd7060 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -136,7 +136,6 @@ void MDSMonitor::update_from_paxos(bool *need_bootstrap) ceph_assert(version > get_fsmap().get_epoch()); load_metadata(pending_metadata); - load_health(); // read and decode bufferlist fsmap_bl; @@ -269,7 +268,7 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t) remove_from_metadata(pending, t); // health - health_check_map_t new_checks; + auto& new_checks = get_health_checks_pending_writeable(); const auto &info_map = pending.get_mds_info(); for (const auto &i : info_map) { const auto &gid = i.first; @@ -336,7 +335,6 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t) std::regex("%hasorhave%"), p.second.detail.size() > 1 ? "have" : "has"); } - encode_health(new_checks, t); } version_t MDSMonitor::get_trim_to() const diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index 72b1d4e2745..ce75bc37f5b 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -183,7 +183,6 @@ void MgrMonitor::update_from_paxos(bool *need_bootstrap) ever_had_active_mgr = get_value("ever_had_active_mgr"); - load_health(); if (map.available) { first_seen_inactive = utime_t(); @@ -346,7 +345,7 @@ void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t) pending_metadata.clear(); pending_metadata_rm.clear(); - health_check_map_t next; + auto& next = get_health_checks_pending_writeable(); if (pending_map.active_gid == 0) { auto level = should_warn_about_mgr_down(); if (level != HEALTH_OK) { @@ -358,7 +357,6 @@ void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t) } else { put_value(t, "ever_had_active_mgr", 1); } - encode_health(next, t); if (pending_command_descs.size()) { dout(4) << __func__ << " encoding " << pending_command_descs.size() diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc index f91056bf9c6..b85c6c7cc2c 100644 --- a/src/mon/MgrStatMonitor.cc +++ b/src/mon/MgrStatMonitor.cc @@ -231,7 +231,6 @@ void MgrStatMonitor::update_from_paxos(bool *need_bootstrap) { version = get_last_committed(); dout(10) << " " << version << dendl; - load_health(); bufferlist bl; get_version(version, bl); if (version) { @@ -313,7 +312,6 @@ void MgrStatMonitor::create_pending() { dout(10) << " " << version << dendl; pending_digest = digest; - pending_health_checks = get_health_checks(); pending_service_map_bl.clear(); encode(service_map, pending_service_map_bl, mon.get_quorum_con_features()); } @@ -330,8 +328,6 @@ void MgrStatMonitor::encode_pending(MonitorDBStore::TransactionRef t) encode(pending_pool_availability, bl); put_version(t, version, bl); put_last_committed(t, version); - - encode_health(pending_health_checks, t); } version_t MgrStatMonitor::get_trim_to() const @@ -401,7 +397,7 @@ bool MgrStatMonitor::prepare_report(MonOpRequestRef op) bufferlist bl = m->get_data(); auto p = bl.cbegin(); decode(pending_digest, p); - pending_health_checks.swap(m->health_checks); + auto& pending_health_checks = get_health_checks_pending_writeable(); if (m->service_map_bl.length()) { pending_service_map_bl.swap(m->service_map_bl); } diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc index ad8836dca38..1821a878c7a 100644 --- a/src/mon/MonmapMonitor.cc +++ b/src/mon/MonmapMonitor.cc @@ -157,9 +157,8 @@ void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t) } //health - health_check_map_t next; + auto& next = get_health_checks_pending_writeable(); pending_map.check_health(&next); - encode_health(next, t); } class C_ApplyFeatures : public Context { diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc index 597aee4e144..cb79a35c86b 100644 --- a/src/mon/NVMeofGwMon.cc +++ b/src/mon/NVMeofGwMon.cc @@ -236,9 +236,8 @@ void NVMeofGwMon::encode_pending(MonitorDBStore::TransactionRef t) put_last_committed(t, pending_map.epoch); //health - health_check_map_t checks; + auto& checks = get_health_checks_pending_writeable(); pending_map.get_health_checks(&checks); - encode_health(checks, t); } void NVMeofGwMon::update_from_paxos(bool *need_bootstrap) @@ -251,7 +250,6 @@ void NVMeofGwMon::update_from_paxos(bool *need_bootstrap) bufferlist bl; int err = get_version(version, bl); ceph_assert(err == 0); - load_health(); auto p = bl.cbegin(); map.decode(p); diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index b7698ef2540..fbf7123a844 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -741,7 +741,6 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) mapping_job.reset(); } - load_health(); /* * We will possibly have a stashed latest that *we* wrote, and we will @@ -2072,9 +2071,8 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t) } // health - health_check_map_t next; + auto& next = get_health_checks_pending_writeable(); tmp.check_health(cct, &next); - encode_health(next, t); } int OSDMonitor::load_metadata(int osd, map& m, ostream *err) diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc index ce2c5977f2e..75853e05912 100644 --- a/src/mon/PaxosService.cc +++ b/src/mon/PaxosService.cc @@ -161,7 +161,7 @@ void PaxosService::refresh(bool *need_bootstrap) format_version = new_format; - update_from_paxos(need_bootstrap); + _update_from_paxos(need_bootstrap); } void PaxosService::post_refresh() @@ -221,7 +221,7 @@ void PaxosService::propose_pending() if (should_stash_full()) encode_full(t); - encode_pending(t); + _encode_pending(t); have_pending = false; if (format_version > 0) { @@ -338,7 +338,7 @@ void PaxosService::_active() if (mon.is_leader()) { dout(7) << __func__ << " creating new pending" << dendl; if (!have_pending) { - create_pending(); + _create_pending(); have_pending = true; } @@ -464,13 +464,34 @@ void PaxosService::trim(MonitorDBStore::TransactionRef t, } } -void PaxosService::load_health() +void PaxosService::_create_pending() { + dout(10) << __func__ << dendl; + health_checks.create_pending(); + dout(30) << __func__ << ": health_checks pending: " << health_checks << dendl; + create_pending(); +} + +void PaxosService::_encode_pending(MonitorDBStore::TransactionRef t) { + dout(10) << __func__ << dendl; + using ceph::encode; + encode_pending(t); + dout(30) << __func__ << ": health_checks encoding: " << health_checks << dendl; + auto const& pending = health_checks.get_pending_map(); + ceph::buffer::list bl; + encode(pending, bl); + t->put("health", service_name, bl); + mon.log_health(pending, health_checks.get_map(), t); +} + +void PaxosService::_update_from_paxos(bool* need_bootstrap) { + dout(10) << __func__ << dendl; bufferlist bl; mon.store->get("health", service_name, bl); if (bl.length()) { auto p = bl.cbegin(); - using ceph::decode; - decode(health_checks, p); + health_checks.decode(p); + dout(30) << __func__ << ": health_checks decoded: " << health_checks << dendl; } + update_from_paxos(need_bootstrap); } diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h index cf149ae6985..ed5949e56b0 100644 --- a/src/mon/PaxosService.h +++ b/src/mon/PaxosService.h @@ -19,6 +19,7 @@ #include "Paxos.h" #include "Monitor.h" #include "MonitorDBStore.h" +#include "PaxosMap.h" /** * A Paxos Service is an abstraction that easily allows one to obtain an @@ -48,7 +49,7 @@ class PaxosService { * If we are or have queued anything for proposal, this variable will be true * until our proposal has been finished. */ - bool proposing; + bool proposing = false; bool need_immediate_propose = false; @@ -60,35 +61,44 @@ protected: * must keep its own version, if so they wish. This variable should be used * for that purpose. */ - version_t service_version; + version_t service_version = 0; - private: +private: /** * Event callback responsible for proposing our pending value once a timer * runs out and fires. */ - Context *proposal_timer; + Context *proposal_timer = nullptr; /** * If the implementation class has anything pending to be proposed to Paxos, * then have_pending should be true; otherwise, false. */ - bool have_pending; + bool have_pending = false; + +protected: + /** + * format of our state in RocksDB, 0 for default + */ + version_t format_version = 0; /** * health checks for this service - * - * Child must populate this during encode_pending() by calling encode_health(). */ - health_check_map_t health_checks; -protected: + PaxosMap health_checks; + /** - * format of our state in RocksDB, 0 for default + * The pending health check map. Only callable by the service itself. */ - version_t format_version; + health_check_map_t& get_health_checks_pending_writeable() { + return health_checks.get_pending_map_writeable(); + } public: - const health_check_map_t& get_health_checks() const { - return health_checks; + /** + * The current (i.e. not pending) health check map. + */ + health_check_map_t const& get_health_checks() const { + return health_checks.get_map(); } /** @@ -145,14 +155,7 @@ public: * @param name Our service's name. */ PaxosService(Monitor &mn, Paxos &p, std::string name) - : mon(mn), paxos(p), service_name(name), - proposing(false), - service_version(0), proposal_timer(0), have_pending(false), - format_version(0), - last_committed_name("last_committed"), - first_committed_name("first_committed"), - full_prefix_name("full"), full_latest_name("latest"), - cached_first_committed(0), cached_last_committed(0) + : mon(mn), paxos(p), service_name(std::move(name)), health_checks(mn, *this) { } @@ -212,6 +215,12 @@ private: */ void _active(); + void _create_pending(); + + void _encode_pending(MonitorDBStore::TransactionRef t); + + void _update_from_paxos(bool* need_bootstrap); + public: /** * Propose a new value through Paxos. @@ -435,16 +444,6 @@ public: */ virtual void tick() {} - void encode_health(const health_check_map_t& next, - MonitorDBStore::TransactionRef t) { - using ceph::encode; - ceph::buffer::list bl; - encode(next, bl); - t->put("health", service_name, bl); - mon.log_health(next, health_checks, t); - } - void load_health(); - /** * @defgroup PaxosService_h_store_keys Set of keys that are usually used on * all the services implementing this @@ -453,10 +452,10 @@ public: * mistakes. * @{ */ - const std::string last_committed_name; - const std::string first_committed_name; - const std::string full_prefix_name; - const std::string full_latest_name; + const std::string last_committed_name{"last_committed"}; + const std::string first_committed_name{"first_committed"}; + const std::string full_prefix_name{"full"}; + const std::string full_latest_name{"latest"}; /** * @} */ @@ -472,8 +471,8 @@ public: * and avoid the overhead. * @{ */ - version_t cached_first_committed; - version_t cached_last_committed; + version_t cached_first_committed = 0; + version_t cached_last_committed = 0; /** * @} */ diff --git a/src/mon/health_check.h b/src/mon/health_check.h index e6d1d759551..be8d9119185 100644 --- a/src/mon/health_check.h +++ b/src/mon/health_check.h @@ -3,12 +3,15 @@ #pragma once -#include +#include #include +#include #include "include/health.h" #include "include/utime.h" +#include "common/CanHasPrint.h" #include "common/Formatter.h" +#include "common/dout.h" struct health_check_t { health_status_t severity; @@ -39,6 +42,13 @@ struct health_check_t { return !(l == r); } + void print(std::ostream& os) const { + os << "hc" + << "(" << summary + << " count=" << count + << ")"; + } + void dump(ceph::Formatter *f, bool want_detail=true) const { f->dump_stream("severity") << severity; @@ -119,6 +129,17 @@ struct health_check_map_t { DENC_FINISH(p); } + static constexpr bool is_ephemeral() { + return true; + } + + void print(std::ostream& os) const { + os << "health_check_map_t" + << "(" + << checks + << ")"; + } + void dump(ceph::Formatter *f) const { for (auto& [code, check] : checks) { f->dump_object(code, check); -- 2.39.5