From: Kamoltat Sirivadhna <ksirivad@redhat.com>
Date: Tue, 9 Sep 2025 11:31:48 +0000 (+0000)
Subject: pybind/mgr/progress: refractor Global Recovery Event
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=4705f2d8d5b2215be614efd315217e2dfd3dd8ad;p=ceph-ci.git

pybind/mgr/progress: refractor Global Recovery Event

1. rename def _refresh() to def _persist() for _persist is more true to what the function actually does.
2. rename pg_stats_active_clean Active Py Module objects such that it is more intuitive and more readable.
3. refractor def global_event_update_progres for better readability.

Fixes: https://tracker.ceph.com/issues/72857

Signed-off-by: Kamoltat Sirivadhna <ksirivad@redhat.com>
---

diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc
index b770d24bf94..66e92f11e7f 100644
--- a/src/mgr/ActivePyModules.cc
+++ b/src/mgr/ActivePyModules.cc
@@ -495,28 +495,31 @@ PyObject *ActivePyModules::get_python(const std::string &what)
     f.close_section();
   } else if (what == "have_local_config_map") {
     f.dump_bool("have_local_config_map", have_local_config_map);
-  } else if (what == "active_clean_pgs"){
+  } else if (what == "pg_stats_active_clean"){
     without_gil_t no_gil;
     cluster_state.with_pgmap(
         [&](const PGMap &pg_map) {
       no_gil.acquire_gil();
-      f.open_array_section("pg_stats");
+      f.open_array_section("active_clean_pgs");
+      int active_clean_count = 0;
       for (auto &i : pg_map.pg_stat) {
         const auto state = i.second.state;
-	const auto pgid_raw = i.first;
-	const auto pgid = stringify(pgid_raw.m_pool) + "." + stringify(pgid_raw.m_seed);
-	const auto reported_epoch = i.second.reported_epoch;
-	if (state & PG_STATE_ACTIVE && state & PG_STATE_CLEAN) {
-	  f.open_object_section("pg_stat");
-	  f.dump_string("pgid", pgid);
-	  f.dump_string("state", pg_state_string(state));
-	  f.dump_unsigned("reported_epoch", reported_epoch);
-	  f.close_section();
-	}
+	      const auto pgid_raw = i.first;
+	      const auto pgid = stringify(pgid_raw.m_pool) + "." + stringify(pgid_raw.m_seed);
+	      const auto reported_epoch = i.second.reported_epoch;
+	      if (state & PG_STATE_ACTIVE && state & PG_STATE_CLEAN) {
+          active_clean_count++;
+          f.open_object_section("");
+	        f.dump_string("pgid", pgid);
+	        f.dump_string("state", pg_state_string(state));
+	        f.dump_unsigned("reported_epoch", reported_epoch);
+	        f.close_section();
+	      }
       }
       f.close_section();
+      f.dump_unsigned("active_clean_pg_count", active_clean_count);
       const auto num_pg = pg_map.num_pg;
-      f.dump_unsigned("total_num_pgs", num_pg);
+      f.dump_unsigned("total_pg_count", num_pg);
     });
   } else {
     derr << "Python module requested unknown data '" << what << "'" << dendl;
diff --git a/src/pybind/mgr/progress/module.py b/src/pybind/mgr/progress/module.py
index 7c98200faa6..f1085fac7af 100644
--- a/src/pybind/mgr/progress/module.py
+++ b/src/pybind/mgr/progress/module.py
@@ -39,10 +39,13 @@ class Event(object):
         self.id = id
         self._add_to_ceph_s = add_to_ceph_s
 
-    def _refresh(self):
+    def _persist(self) -> None:
+        """
+        Persist the event to the Monitor
+        """
         global _module
         assert _module
-        _module.log.debug('refreshing mgr for %s (%s) at %f' % (self.id, self._message,
+        _module.log.debug('persisting event %s (%s) at %f' % (self.id, self._message,
                                                                 self.progress))
         _module.update_progress_event(
             self.id, self.twoline_progress(6), self.progress, self._add_to_ceph_s)
@@ -192,7 +195,7 @@ class GlobalRecoveryEvent(Event):
         self._progress = 0.0
         self._start_epoch = start_epoch
         self._active_clean_num = active_clean_num
-        self._refresh()
+        self._persist()
 
     def global_event_update_progress(self, log):
         # type: (logging.Logger) -> None
@@ -200,11 +203,11 @@ class GlobalRecoveryEvent(Event):
         global _module
         assert _module
         skipped_pgs = 0
-        active_clean_pgs = _module.get("active_clean_pgs")
-        total_pg_num = active_clean_pgs["total_num_pgs"]
-        new_active_clean_pgs = active_clean_pgs["pg_stats"]
-        new_active_clean_num = len(new_active_clean_pgs)
-        for pg in new_active_clean_pgs:
+        pg_stats_active_clean = _module.get("pg_stats_active_clean")
+        total_pg_num = pg_stats_active_clean["total_pg_count"]
+        active_clean_pgs = pg_stats_active_clean["active_clean_pgs"]
+        active_clean_pg_count = pg_stats_active_clean["active_clean_pg_count"]
+        for pg in active_clean_pgs:
             # Disregard PGs that are not being reported
             # if the states are active+clean. Since it is
             # possible that some pgs might not have any movement
@@ -214,21 +217,26 @@ class GlobalRecoveryEvent(Event):
                              .format(pg['pgid'], pg['reported_epoch'], self._start_epoch))
                 skipped_pgs += 1
                 continue
+        start = self._active_clean_num
+        current = active_clean_pg_count
+        total = total_pg_num - skipped_pgs
+        if (current == total):
+            # All active+clean PGs have been recovered
+            self._progress = 1.0
+            self._persist()
+            return
 
-        if self._active_clean_num != new_active_clean_num:
-            # Have this case to know when need to update
-            # the progress
-            try:
-                # Might be that total_pg_num is 0
-                self._progress = float(new_active_clean_num) / (total_pg_num - skipped_pgs)
-            except ZeroDivisionError:
-                self._progress = 0.0
-        else:
-            # No need to update since there is no change
+        denominator = (start - total)
+        if denominator == 0:
+            # start == target, No need to track anymore
+            self._progress = 1.0
+            self._persist()
             return
 
+        self._progress = (start - current) / denominator
+
         log.debug("Updated progress to %s", self.summary())
-        self._refresh()
+        self._persist()
 
     @property
     def progress(self):
@@ -247,22 +255,22 @@ class RemoteEvent(Event):
         super().__init__(my_id, message, refs, add_to_ceph_s)
         self._progress = 0.0
         self._failed = False
-        self._refresh()
+        self._persist()
 
     def set_progress(self, progress):
         # type: (float) -> None
         self._progress = progress
-        self._refresh()
+        self._persist()
 
     def set_failed(self, message):
         self._progress = 1.0
         self._failed = True
         self._failure_message = message
-        self._refresh()
+        self._persist()
 
     def set_message(self, message):
         self._message = message
-        self._refresh()
+        self._persist()
 
     @property
     def progress(self):
@@ -295,7 +303,7 @@ class PgRecoveryEvent(Event):
         self._progress = 0.0
 
         self._start_epoch = start_epoch
-        self._refresh()
+        self._persist()
 
     @property
     def which_osds(self):
@@ -386,7 +394,7 @@ class PgRecoveryEvent(Event):
 
         self._progress = min(max(prog, 0.0), 1.0)
 
-        self._refresh()
+        self._persist()
         log.info("Updated progress to %s", self.summary())
 
     @property
@@ -597,23 +605,23 @@ class Module(MgrModule):
         # This function both constructs and updates
         # the global recovery event if one of the
         # PGs is not at active+clean state
-        active_clean_pgs = self.get("active_clean_pgs")
-        total_pg_num = active_clean_pgs["total_num_pgs"]
-        active_clean_num = len(active_clean_pgs["pg_stats"])
+        pg_stats_active_clean = _module.get("pg_stats_active_clean")
+        total_pg_count = pg_stats_active_clean["total_pg_count"]
+        active_clean_pg_count = pg_stats_active_clean["active_clean_pg_count"]
         try:
             # There might be a case where there is no pg_num
-            progress = float(active_clean_num) / total_pg_num
+            progress = float(active_clean_pg_count) / total_pg_count
         except ZeroDivisionError:
             return
         if progress < 1.0:
             self.log.warning(("Starting Global Recovery Event,"
                               "%d pgs not in active + clean state"),
-                              total_pg_num - active_clean_num)
+                              total_pg_count - active_clean_pg_count)
             ev = GlobalRecoveryEvent("Global Recovery Event",
                     refs=[("global", "")],
                     add_to_ceph_s=True,
                     start_epoch=self.get_osdmap().get_epoch(),
-                    active_clean_num=active_clean_num)
+                    active_clean_num=active_clean_pg_count)
             ev.global_event_update_progress(self.log)
             self._events[ev.id] = ev