From: Kamoltat Sirivadhna Date: Tue, 9 Sep 2025 11:31:48 +0000 (+0000) Subject: pybind/mgr/progress: refractor Global Recovery Event X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=4705f2d8d5b2215be614efd315217e2dfd3dd8ad;p=ceph-ci.git pybind/mgr/progress: refractor Global Recovery Event 1. rename def _refresh() to def _persist() for _persist is more true to what the function actually does. 2. rename pg_stats_active_clean Active Py Module objects such that it is more intuitive and more readable. 3. refractor def global_event_update_progres for better readability. Fixes: https://tracker.ceph.com/issues/72857 Signed-off-by: Kamoltat Sirivadhna --- diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc index b770d24bf94..66e92f11e7f 100644 --- a/src/mgr/ActivePyModules.cc +++ b/src/mgr/ActivePyModules.cc @@ -495,28 +495,31 @@ PyObject *ActivePyModules::get_python(const std::string &what) f.close_section(); } else if (what == "have_local_config_map") { f.dump_bool("have_local_config_map", have_local_config_map); - } else if (what == "active_clean_pgs"){ + } else if (what == "pg_stats_active_clean"){ without_gil_t no_gil; cluster_state.with_pgmap( [&](const PGMap &pg_map) { no_gil.acquire_gil(); - f.open_array_section("pg_stats"); + f.open_array_section("active_clean_pgs"); + int active_clean_count = 0; for (auto &i : pg_map.pg_stat) { const auto state = i.second.state; - const auto pgid_raw = i.first; - const auto pgid = stringify(pgid_raw.m_pool) + "." + stringify(pgid_raw.m_seed); - const auto reported_epoch = i.second.reported_epoch; - if (state & PG_STATE_ACTIVE && state & PG_STATE_CLEAN) { - f.open_object_section("pg_stat"); - f.dump_string("pgid", pgid); - f.dump_string("state", pg_state_string(state)); - f.dump_unsigned("reported_epoch", reported_epoch); - f.close_section(); - } + const auto pgid_raw = i.first; + const auto pgid = stringify(pgid_raw.m_pool) + "." + stringify(pgid_raw.m_seed); + const auto reported_epoch = i.second.reported_epoch; + if (state & PG_STATE_ACTIVE && state & PG_STATE_CLEAN) { + active_clean_count++; + f.open_object_section(""); + f.dump_string("pgid", pgid); + f.dump_string("state", pg_state_string(state)); + f.dump_unsigned("reported_epoch", reported_epoch); + f.close_section(); + } } f.close_section(); + f.dump_unsigned("active_clean_pg_count", active_clean_count); const auto num_pg = pg_map.num_pg; - f.dump_unsigned("total_num_pgs", num_pg); + f.dump_unsigned("total_pg_count", num_pg); }); } else { derr << "Python module requested unknown data '" << what << "'" << dendl; diff --git a/src/pybind/mgr/progress/module.py b/src/pybind/mgr/progress/module.py index 7c98200faa6..f1085fac7af 100644 --- a/src/pybind/mgr/progress/module.py +++ b/src/pybind/mgr/progress/module.py @@ -39,10 +39,13 @@ class Event(object): self.id = id self._add_to_ceph_s = add_to_ceph_s - def _refresh(self): + def _persist(self) -> None: + """ + Persist the event to the Monitor + """ global _module assert _module - _module.log.debug('refreshing mgr for %s (%s) at %f' % (self.id, self._message, + _module.log.debug('persisting event %s (%s) at %f' % (self.id, self._message, self.progress)) _module.update_progress_event( self.id, self.twoline_progress(6), self.progress, self._add_to_ceph_s) @@ -192,7 +195,7 @@ class GlobalRecoveryEvent(Event): self._progress = 0.0 self._start_epoch = start_epoch self._active_clean_num = active_clean_num - self._refresh() + self._persist() def global_event_update_progress(self, log): # type: (logging.Logger) -> None @@ -200,11 +203,11 @@ class GlobalRecoveryEvent(Event): global _module assert _module skipped_pgs = 0 - active_clean_pgs = _module.get("active_clean_pgs") - total_pg_num = active_clean_pgs["total_num_pgs"] - new_active_clean_pgs = active_clean_pgs["pg_stats"] - new_active_clean_num = len(new_active_clean_pgs) - for pg in new_active_clean_pgs: + pg_stats_active_clean = _module.get("pg_stats_active_clean") + total_pg_num = pg_stats_active_clean["total_pg_count"] + active_clean_pgs = pg_stats_active_clean["active_clean_pgs"] + active_clean_pg_count = pg_stats_active_clean["active_clean_pg_count"] + for pg in active_clean_pgs: # Disregard PGs that are not being reported # if the states are active+clean. Since it is # possible that some pgs might not have any movement @@ -214,21 +217,26 @@ class GlobalRecoveryEvent(Event): .format(pg['pgid'], pg['reported_epoch'], self._start_epoch)) skipped_pgs += 1 continue + start = self._active_clean_num + current = active_clean_pg_count + total = total_pg_num - skipped_pgs + if (current == total): + # All active+clean PGs have been recovered + self._progress = 1.0 + self._persist() + return - if self._active_clean_num != new_active_clean_num: - # Have this case to know when need to update - # the progress - try: - # Might be that total_pg_num is 0 - self._progress = float(new_active_clean_num) / (total_pg_num - skipped_pgs) - except ZeroDivisionError: - self._progress = 0.0 - else: - # No need to update since there is no change + denominator = (start - total) + if denominator == 0: + # start == target, No need to track anymore + self._progress = 1.0 + self._persist() return + self._progress = (start - current) / denominator + log.debug("Updated progress to %s", self.summary()) - self._refresh() + self._persist() @property def progress(self): @@ -247,22 +255,22 @@ class RemoteEvent(Event): super().__init__(my_id, message, refs, add_to_ceph_s) self._progress = 0.0 self._failed = False - self._refresh() + self._persist() def set_progress(self, progress): # type: (float) -> None self._progress = progress - self._refresh() + self._persist() def set_failed(self, message): self._progress = 1.0 self._failed = True self._failure_message = message - self._refresh() + self._persist() def set_message(self, message): self._message = message - self._refresh() + self._persist() @property def progress(self): @@ -295,7 +303,7 @@ class PgRecoveryEvent(Event): self._progress = 0.0 self._start_epoch = start_epoch - self._refresh() + self._persist() @property def which_osds(self): @@ -386,7 +394,7 @@ class PgRecoveryEvent(Event): self._progress = min(max(prog, 0.0), 1.0) - self._refresh() + self._persist() log.info("Updated progress to %s", self.summary()) @property @@ -597,23 +605,23 @@ class Module(MgrModule): # This function both constructs and updates # the global recovery event if one of the # PGs is not at active+clean state - active_clean_pgs = self.get("active_clean_pgs") - total_pg_num = active_clean_pgs["total_num_pgs"] - active_clean_num = len(active_clean_pgs["pg_stats"]) + pg_stats_active_clean = _module.get("pg_stats_active_clean") + total_pg_count = pg_stats_active_clean["total_pg_count"] + active_clean_pg_count = pg_stats_active_clean["active_clean_pg_count"] try: # There might be a case where there is no pg_num - progress = float(active_clean_num) / total_pg_num + progress = float(active_clean_pg_count) / total_pg_count except ZeroDivisionError: return if progress < 1.0: self.log.warning(("Starting Global Recovery Event," "%d pgs not in active + clean state"), - total_pg_num - active_clean_num) + total_pg_count - active_clean_pg_count) ev = GlobalRecoveryEvent("Global Recovery Event", refs=[("global", "")], add_to_ceph_s=True, start_epoch=self.get_osdmap().get_epoch(), - active_clean_num=active_clean_num) + active_clean_num=active_clean_pg_count) ev.global_event_update_progress(self.log) self._events[ev.id] = ev