From: Shraddha Agrawal Date: Thu, 22 May 2025 10:26:41 +0000 (+0530) Subject: mon/MgrStatMonitor: ignore duration for which feature is off X-Git-Tag: testing/wip-khiremat-testing-20250628.025302-tentacle-debug~18^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=31fe915f77276a78f8a49611f9775b4fd775cc93;p=ceph-ci.git mon/MgrStatMonitor: ignore duration for which feature is off When the availability tracking feature is disabled, we should not be updating the score. We should start recalculating the score when the user enables the features again. Essentially, for the purpose of calculating the score, we need to ignore the duration for which the feature was turned off. The score is calculated from the uptime and downtime durations recorded in `pool_availability` object. These durations are updated in `calc_pool_availability` by adding the diff between last_uptime/ last_downtime and now. To discard the duration for which the feature was turned off, we need to offset the uptime/downtime by this duration. A simple way to do this is to update the last_uptime and last_downtime to the timestamp when the feature is toggled on again. To implement the same, we record the time at which the feature is toggled from off to on. When `calc_pool_availability` is invoked, if a reset is required, it resets last_uptime and last_downtime before proceeding with availability calculations. We only care about the state when the feature is toggled from off to on. All other toggle states for the config option will not have any effect on the score. Fixes: https://tracker.ceph.com/issues/71494 Signed-off-by: Shraddha Agrawal (cherry picked from commit d81d2af8fcd708f20f54f863dd613fade57af6e5) --- diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc index da31318b441..224651a2eb6 100644 --- a/src/mon/MgrStatMonitor.cc +++ b/src/mon/MgrStatMonitor.cc @@ -71,8 +71,24 @@ void MgrStatMonitor::handle_conf_change( const ConfigProxy& conf, const std::set& changed) { - // implement changes here - dout(10) << __func__ << " enable_availability_tracking config option is changed." << dendl; + if (changed.count("enable_availability_tracking")) { + std::scoped_lock l(lock); + bool oldval = enable_availability_tracking; + bool newval = g_conf().get_val("enable_availability_tracking"); + dout(10) << __func__ << " enable_availability_tracking config option is changed from " + << oldval << " to " << newval + << dendl; + + // if fetaure is toggled from off to on, + // store the new value of last_uptime and last_downtime + // (to be updated in calc_pool_availability) + if (newval > oldval) { + reset_availability_last_uptime_downtime_val = ceph_clock_now(); + dout(10) << __func__ << " reset_availability_last_uptime_downtime_val " + << reset_availability_last_uptime_downtime_val << dendl; + } + enable_availability_tracking = newval; + } } void MgrStatMonitor::create_initial() @@ -88,14 +104,29 @@ void MgrStatMonitor::create_initial() void MgrStatMonitor::calc_pool_availability() { dout(20) << __func__ << dendl; + std::scoped_lock l(lock); // if feature is disabled by user, do not update the uptime // and downtime, exit early - if (!g_conf().get_val("enable_availability_tracking")) { + if (!enable_availability_tracking) { dout(20) << __func__ << " tracking availability score is disabled" << dendl; return; } + // if reset_availability_last_uptime_downtime_val is not utime_t(1, 2), + // update last_uptime and last_downtime for all pools to the + // recorded values + if (reset_availability_last_uptime_downtime_val.has_value()) { + for (const auto& i : pool_availability) { + const auto& poolid = i.first; + pool_availability[poolid].last_downtime = reset_availability_last_uptime_downtime_val.value(); + pool_availability[poolid].last_uptime = reset_availability_last_uptime_downtime_val.value(); + } + dout(20) << __func__ << " reset last_uptime and last_downtime to " + << reset_availability_last_uptime_downtime_val << dendl; + reset_availability_last_uptime_downtime_val.reset(); + } + auto pool_avail_end = pool_availability.end(); for (const auto& i : digest.pool_pg_unavailable_map) { const auto& poolid = i.first; diff --git a/src/mon/MgrStatMonitor.h b/src/mon/MgrStatMonitor.h index c02cbc4976b..5397a160087 100644 --- a/src/mon/MgrStatMonitor.h +++ b/src/mon/MgrStatMonitor.h @@ -28,6 +28,8 @@ public: MgrStatMonitor(Monitor &mn, Paxos &p, const std::string& service_name); ~MgrStatMonitor() override; + ceph::mutex lock = ceph::make_mutex("MgrStatMonitor::lock"); + void init() override {} void on_shutdown() override {} @@ -53,7 +55,9 @@ public: bool preprocess_statfs(MonOpRequestRef op); void calc_pool_availability(); - + bool enable_availability_tracking = g_conf().get_val("enable_availability_tracking"); ///< tracking availability score feature + std::optional reset_availability_last_uptime_downtime_val; + void check_sub(Subscription *sub); void check_subs(); void send_digests();