pybind/mgr/progress: refractor Global Recovery Event

author Kamoltat Sirivadhna <ksirivad@redhat.com>

Tue, 9 Sep 2025 11:31:48 +0000 (11:31 +0000)

committer Kamoltat Sirivadhna <ksirivad@redhat.com>

Fri, 3 Oct 2025 20:23:31 +0000 (20:23 +0000)
author Kamoltat Sirivadhna <ksirivad@redhat.com>
Tue, 9 Sep 2025 11:31:48 +0000 (11:31 +0000)
committer Kamoltat Sirivadhna <ksirivad@redhat.com>
Fri, 3 Oct 2025 20:23:31 +0000 (20:23 +0000)
diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc

index b770d24bf943e0ccd77ea65bd84160f42529e87b..66e92f11e7fa8f3a73813f48787578c5113f2d9e 100644 (file)
--- a/src/mgr/ActivePyModules.cc
+++ b/src/mgr/ActivePyModules.cc
@@ -495,28 +495,31 @@ PyObject *ActivePyModules::get_python(const std::string &what)
      f.close_section();
    } else if (what == "have_local_config_map") {
      f.dump_bool("have_local_config_map", have_local_config_map);
-  } else if (what == "active_clean_pgs"){
+  } else if (what == "pg_stats_active_clean"){
      without_gil_t no_gil;
      cluster_state.with_pgmap(
          [&](const PGMap &pg_map) {
        no_gil.acquire_gil();
-      f.open_array_section("pg_stats");
+      f.open_array_section("active_clean_pgs");
+      int active_clean_count = 0;
        for (auto &i : pg_map.pg_stat) {
          const auto state = i.second.state;
-       const auto pgid_raw = i.first;
-       const auto pgid = stringify(pgid_raw.m_pool) + "." + stringify(pgid_raw.m_seed);
-       const auto reported_epoch = i.second.reported_epoch;
-       if (state & PG_STATE_ACTIVE && state & PG_STATE_CLEAN) {
-         f.open_object_section("pg_stat");
-         f.dump_string("pgid", pgid);
-         f.dump_string("state", pg_state_string(state));
-         f.dump_unsigned("reported_epoch", reported_epoch);
-         f.close_section();
-       }
+             const auto pgid_raw = i.first;
+             const auto pgid = stringify(pgid_raw.m_pool) + "." + stringify(pgid_raw.m_seed);
+             const auto reported_epoch = i.second.reported_epoch;
+             if (state & PG_STATE_ACTIVE && state & PG_STATE_CLEAN) {
+          active_clean_count++;
+          f.open_object_section("");
+               f.dump_string("pgid", pgid);
+               f.dump_string("state", pg_state_string(state));
+               f.dump_unsigned("reported_epoch", reported_epoch);
+               f.close_section();
+             }
        }
        f.close_section();
+      f.dump_unsigned("active_clean_pg_count", active_clean_count);
        const auto num_pg = pg_map.num_pg;
-      f.dump_unsigned("total_num_pgs", num_pg);
+      f.dump_unsigned("total_pg_count", num_pg);
      });
    } else {
      derr << "Python module requested unknown data '" << what << "'" << dendl;
diff --git a/src/pybind/mgr/progress/module.py b/src/pybind/mgr/progress/module.py

index 7c98200faa6a7f1fc1f1d3936303ab1c5356ed98..f1085fac7af628bc858cf41f08c8df9289ba41e0 100644 (file)
--- a/src/pybind/mgr/progress/module.py
+++ b/src/pybind/mgr/progress/module.py
@@ -39,10 +39,13 @@ class Event(object):
          self.id = id
          self._add_to_ceph_s = add_to_ceph_s
  
-    def _refresh(self):
+    def _persist(self) -> None:
+        """
+        Persist the event to the Monitor
+        """
          global _module
          assert _module
-        _module.log.debug('refreshing mgr for %s (%s) at %f' % (self.id, self._message,
+        _module.log.debug('persisting event %s (%s) at %f' % (self.id, self._message,
                                                                  self.progress))
          _module.update_progress_event(
              self.id, self.twoline_progress(6), self.progress, self._add_to_ceph_s)
@@ -192,7 +195,7 @@ class GlobalRecoveryEvent(Event):
          self._progress = 0.0
          self._start_epoch = start_epoch
          self._active_clean_num = active_clean_num
-        self._refresh()
+        self._persist()
  
      def global_event_update_progress(self, log):
          # type: (logging.Logger) -> None
@@ -200,11 +203,11 @@ class GlobalRecoveryEvent(Event):
          global _module
          assert _module
          skipped_pgs = 0
-        active_clean_pgs = _module.get("active_clean_pgs")
-        total_pg_num = active_clean_pgs["total_num_pgs"]
-        new_active_clean_pgs = active_clean_pgs["pg_stats"]
-        new_active_clean_num = len(new_active_clean_pgs)
-        for pg in new_active_clean_pgs:
+        pg_stats_active_clean = _module.get("pg_stats_active_clean")
+        total_pg_num = pg_stats_active_clean["total_pg_count"]
+        active_clean_pgs = pg_stats_active_clean["active_clean_pgs"]
+        active_clean_pg_count = pg_stats_active_clean["active_clean_pg_count"]
+        for pg in active_clean_pgs:
              # Disregard PGs that are not being reported
              # if the states are active+clean. Since it is
              # possible that some pgs might not have any movement
@@ -214,21 +217,26 @@ class GlobalRecoveryEvent(Event):
                               .format(pg['pgid'], pg['reported_epoch'], self._start_epoch))
                  skipped_pgs += 1
                  continue
+        start = self._active_clean_num
+        current = active_clean_pg_count
+        total = total_pg_num - skipped_pgs
+        if (current == total):
+            # All active+clean PGs have been recovered
+            self._progress = 1.0
+            self._persist()
+            return
  
-        if self._active_clean_num != new_active_clean_num:
-            # Have this case to know when need to update
-            # the progress
-            try:
-                # Might be that total_pg_num is 0
-                self._progress = float(new_active_clean_num) / (total_pg_num - skipped_pgs)
-            except ZeroDivisionError:
-                self._progress = 0.0
-        else:
-            # No need to update since there is no change
+        denominator = (start - total)
+        if denominator == 0:
+            # start == target, No need to track anymore
+            self._progress = 1.0
+            self._persist()
              return
  
+        self._progress = (start - current) / denominator
+
          log.debug("Updated progress to %s", self.summary())
-        self._refresh()
+        self._persist()
  
      @property
      def progress(self):
@@ -247,22 +255,22 @@ class RemoteEvent(Event):
          super().__init__(my_id, message, refs, add_to_ceph_s)
          self._progress = 0.0
          self._failed = False
-        self._refresh()
+        self._persist()
  
      def set_progress(self, progress):
          # type: (float) -> None
          self._progress = progress
-        self._refresh()
+        self._persist()
  
      def set_failed(self, message):
          self._progress = 1.0
          self._failed = True
          self._failure_message = message
-        self._refresh()
+        self._persist()
  
      def set_message(self, message):
          self._message = message
-        self._refresh()
+        self._persist()
  
      @property
      def progress(self):
@@ -295,7 +303,7 @@ class PgRecoveryEvent(Event):
          self._progress = 0.0
  
          self._start_epoch = start_epoch
-        self._refresh()
+        self._persist()
  
      @property
      def which_osds(self):
@@ -386,7 +394,7 @@ class PgRecoveryEvent(Event):
  
          self._progress = min(max(prog, 0.0), 1.0)
  
-        self._refresh()
+        self._persist()
          log.info("Updated progress to %s", self.summary())
  
      @property
@@ -597,23 +605,23 @@ class Module(MgrModule):
          # This function both constructs and updates
          # the global recovery event if one of the
          # PGs is not at active+clean state
-        active_clean_pgs = self.get("active_clean_pgs")
-        total_pg_num = active_clean_pgs["total_num_pgs"]
-        active_clean_num = len(active_clean_pgs["pg_stats"])
+        pg_stats_active_clean = _module.get("pg_stats_active_clean")
+        total_pg_count = pg_stats_active_clean["total_pg_count"]
+        active_clean_pg_count = pg_stats_active_clean["active_clean_pg_count"]
          try:
              # There might be a case where there is no pg_num
-            progress = float(active_clean_num) / total_pg_num
+            progress = float(active_clean_pg_count) / total_pg_count
          except ZeroDivisionError:
              return
          if progress < 1.0:
              self.log.warning(("Starting Global Recovery Event,"
                                "%d pgs not in active + clean state"),
-                              total_pg_num - active_clean_num)
+                              total_pg_count - active_clean_pg_count)
              ev = GlobalRecoveryEvent("Global Recovery Event",
                      refs=[("global", "")],
                      add_to_ceph_s=True,
                      start_epoch=self.get_osdmap().get_epoch(),
-                    active_clean_num=active_clean_num)
+                    active_clean_num=active_clean_pg_count)
              ev.global_event_update_progress(self.log)
              self._events[ev.id] = ev
author	Kamoltat Sirivadhna <ksirivad@redhat.com>
	Tue, 9 Sep 2025 11:31:48 +0000 (11:31 +0000)
committer	Kamoltat Sirivadhna <ksirivad@redhat.com>
	Fri, 3 Oct 2025 20:23:31 +0000 (20:23 +0000)
src/mgr/ActivePyModules.cc		patch \| blob \| history
src/pybind/mgr/progress/module.py		patch \| blob \| history