From fa92db1b37e5633e89fc39a4653c39973bf23867 Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Mon, 5 Oct 2020 09:38:35 +0000 Subject: [PATCH] mgr/progress: optimize global recovery module Instead of fetching `pg_stats` from the python part of manager module, we filter out the pgs that are in active + clean state in ActivePyModules.cc then parse these pgs along with `reported_epoch` and the `total_num_pgs` of the clusters to global recovery module. Signed-off-by: Kamoltat --- src/mgr/ActivePyModules.cc | 25 ++++++++++++- src/pybind/mgr/progress/module.py | 58 ++++++++++++------------------- 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc index e1179d4debb4a..27b1baf8d1edb 100644 --- a/src/mgr/ActivePyModules.cc +++ b/src/mgr/ActivePyModules.cc @@ -21,7 +21,7 @@ #include "osd/OSDMap.h" #include "mon/MonMap.h" - +#include "osd/osd_types.h" #include "mgr/MgrContext.h" // For ::mgr_store_prefix @@ -446,6 +446,29 @@ PyObject *ActivePyModules::get_python(const std::string &what) with_gil_t with_gil{no_gil}; f.dump_bool("have_local_config_map", have_local_config_map); return f.get(); + } else if (what == "active_clean_pgs"){ + cluster_state.with_pgmap( + [&](const PGMap &pg_map) { + with_gil_t with_gil{no_gil}; + f.open_array_section("pg_stats"); + for (auto &i : pg_map.pg_stat) { + const auto state = i.second.state; + const auto pgid_raw = i.first; + const auto pgid = stringify(pgid_raw.m_pool) + "." + stringify(pgid_raw.m_seed); + const auto reported_epoch = i.second.reported_epoch; + if (state & PG_STATE_ACTIVE && state & PG_STATE_CLEAN) { + f.open_object_section("pg_stat"); + f.dump_string("pgid", pgid); + f.dump_string("state", pg_state_string(state)); + f.dump_unsigned("reported_epoch", reported_epoch); + f.close_section(); + } + } + f.close_section(); + const auto num_pg = pg_map.num_pg; + f.dump_unsigned("total_num_pgs", num_pg); + }); + return f.get(); } else { derr << "Python module requested unknown data '" << what << "'" << dendl; with_gil_t with_gil{no_gil}; diff --git a/src/pybind/mgr/progress/module.py b/src/pybind/mgr/progress/module.py index 1f6162e2c8bdd..2ec69971c896a 100644 --- a/src/pybind/mgr/progress/module.py +++ b/src/pybind/mgr/progress/module.py @@ -194,33 +194,27 @@ class GlobalRecoveryEvent(Event): self._active_clean_num = active_clean_num self._refresh() - def global_event_update_progress(self, pg_dump, log): - # type: (Dict, logging.Logger) -> None + def global_event_update_progress(self, log): + # type: (logging.Logger) -> None "Update progress of Global Recovery Event" - pgs = pg_dump['pg_stats'] - new_active_clean_num = 0 + global _module + assert _module skipped_pgs = 0 - - for pg in pgs: + active_clean_pgs = _module.get("active_clean_pgs") + total_pg_num = active_clean_pgs["total_num_pgs"] + new_active_clean_pgs = active_clean_pgs["pg_stats"] + new_active_clean_num = len(new_active_clean_pgs) + for pg in new_active_clean_pgs: # Disregard PGs that are not being reported # if the states are active+clean. Since it is # possible that some pgs might not have any movement # even before the start of the event. - - state = pg['state'] - - states = state.split("+") if pg['reported_epoch'] < self._start_epoch: - if "active" in states and "clean" in states: - log.debug("Skipping pg {0} since reported_epoch {1} < start_epoch {2}" - .format(pg['pgid'], pg['reported_epoch'], self._start_epoch)) - skipped_pgs += 1 + log.debug("Skipping pg {0} since reported_epoch {1} < start_epoch {2}" + .format(pg['pgid'], pg['reported_epoch'], self._start_epoch)) + skipped_pgs += 1 continue - if "active" in states and "clean" in states: - new_active_clean_num += 1 - - total_pg_num = len(pgs) if self._active_clean_num != new_active_clean_num: # Have this case to know when need to update # the progress @@ -229,6 +223,9 @@ class GlobalRecoveryEvent(Event): self._progress = float(new_active_clean_num) / (total_pg_num - skipped_pgs) except ZeroDivisionError: self._progress = 0.0 + else: + # No need to update since there is no change + return log.debug("Updated progress to %s", self.summary()) self._refresh() @@ -296,7 +293,7 @@ class PgRecoveryEvent(Event): self._original_pg_count = len(self._pgs) self._original_bytes_recovered = None # type: Optional[Dict[PgId, float]] self._progress = 0.0 - # self._start_epoch = _module.get_osdmap().get_epoch() + self._start_epoch = start_epoch self._refresh() @@ -584,23 +581,14 @@ class Module(MgrModule): self.log.warning("osd.{0} marked in".format(osd_id)) self._osd_in_out(old_osdmap, old_dump, new_osdmap, osd_id, "in") - def _pg_state_changed(self, pg_dump): + def _pg_state_changed(self): # This function both constructs and updates # the global recovery event if one of the # PGs is not at active+clean state - - pgs = pg_dump['pg_stats'] - total_pg_num = len(pgs) - active_clean_num = 0 - for pg in pgs: - state = pg['state'] - # TODO right here we can keep track of epoch as well - # and parse it to global_event_update_progress() - states = state.split("+") - - if "active" in states and "clean" in states: - active_clean_num += 1 + active_clean_pgs = self.get("active_clean_pgs") + total_pg_num = active_clean_pgs["total_num_pgs"] + active_clean_num = len(active_clean_pgs["pg_stats"]) try: # There might be a case where there is no pg_num progress = float(active_clean_num) / total_pg_num @@ -615,7 +603,7 @@ class Module(MgrModule): add_to_ceph_s=True, start_epoch=self.get_osdmap().get_epoch(), active_clean_num=active_clean_num) - ev.global_event_update_progress(self.get('pg_stats'), self.log) + ev.global_event_update_progress(self.log) self._events[ev.id] = ev def _process_osdmap(self): @@ -646,13 +634,13 @@ class Module(MgrModule): self.maybe_complete(ev) elif isinstance(ev, GlobalRecoveryEvent): global_event = True - ev.global_event_update_progress(data, self.log) + ev.global_event_update_progress(self.log) self.maybe_complete(ev) if not global_event: # If there is no global event # we create one - self._pg_state_changed(data) + self._pg_state_changed() def maybe_complete(self, event): # type: (Event) -> None -- 2.39.5