From 49451d547ca2637c8afd5375870320da0c5953bf Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Tue, 13 Jul 2021 19:06:44 +0000 Subject: [PATCH] pybind/mgr/progress: introduce 5 second sleep interval Current progress module only checks pg stats and osdmap when it is notified by the cluster. However, this is expensive in large cluster with many pools and osds. we change it to only check both pg stats and osdmap every 5 seconds. in the function _osd_in_out() we now calculate `is_relocated` by: old_osds != new_osds such that it does not matter if the difference between osds are positive or negative. Signed-off-by: Kamoltat (cherry picked from commit 4504749b81f9cb11d92d5f280565aff3f243adf3) --- src/pybind/mgr/progress/module.py | 99 ++++++++++++++----------------- 1 file changed, 45 insertions(+), 54 deletions(-) diff --git a/src/pybind/mgr/progress/module.py b/src/pybind/mgr/progress/module.py index 3aab7514f3a25..1f6162e2c8bdd 100644 --- a/src/pybind/mgr/progress/module.py +++ b/src/pybind/mgr/progress/module.py @@ -443,10 +443,10 @@ class Module(MgrModule): runtime=True ), Option( - 'persist_interval', + 'sleep_interval', default=5, type='secs', - desc='how frequently to persist completed events', + desc='how long the module is going to sleep', runtime=True ), Option( @@ -477,7 +477,7 @@ class Module(MgrModule): # only for mypy if TYPE_CHECKING: self.max_completed_events = 0 - self.persist_interval = 0 + self.sleep_interval = 0 self.enabled = True def config_notify(self): @@ -492,7 +492,6 @@ class Module(MgrModule): # A function that will create or complete an event when an # OSD is marked in or out according to the affected PGs affected_pgs = [] - unmoved_pgs = [] for pool in old_dump['pools']: pool_id = pool['pool'] # type: str for ps in range(0, pool['pg_num']): @@ -521,11 +520,9 @@ class Module(MgrModule): # Has this OSD been assigned a new location? # (it might not be if there is no suitable place to move # after an OSD is marked in/out) - if marked == "in": - is_relocated = len(old_osds - new_osds) > 0 - else: - is_relocated = len(new_osds - old_osds) > 0 - + + is_relocated = old_osds != new_osds + self.log.debug( "new_up_acting: {0}".format(json.dumps(new_up_acting, indent=4, @@ -534,15 +531,9 @@ class Module(MgrModule): if was_on_out_or_in_osd and is_relocated: # This PG is now in motion, track its progress affected_pgs.append(PgId(pool_id, ps)) - elif not is_relocated: - # This PG didn't get a new location, we'll log it - unmoved_pgs.append(PgId(pool_id, ps)) # In the case that we ignored some PGs, log the reason why (we may # not end up creating a progress event) - if len(unmoved_pgs): - self.log.warning("{0} PGs were on osd.{1}, but didn't get new locations".format( - len(unmoved_pgs), osd_id)) self.log.warning("{0} PGs affected by osd.{1} being marked {2}".format( len(affected_pgs), osd_id, marked)) @@ -627,45 +618,41 @@ class Module(MgrModule): ev.global_event_update_progress(self.get('pg_stats'), self.log) self._events[ev.id] = ev - def notify(self, notify_type, notify_data): - self._ready.wait() - if not self.enabled: + def _process_osdmap(self): + old_osdmap = self._latest_osdmap + self._latest_osdmap = self.get_osdmap() + assert old_osdmap + assert self._latest_osdmap + self.log.info(("Processing OSDMap change %d..%d"), + old_osdmap.get_epoch(), self._latest_osdmap.get_epoch()) + + self._osdmap_changed(old_osdmap, self._latest_osdmap) + + def _process_pg_summary(self): + # if there are no events we will skip this here to avoid + # expensive get calls + if len(self._events) == 0: return - if notify_type == "osd_map": - old_osdmap = self._latest_osdmap - self._latest_osdmap = self.get_osdmap() - assert old_osdmap - assert self._latest_osdmap - - self.log.info(("Processing OSDMap change %d..%d"), - old_osdmap.get_epoch(), self._latest_osdmap.get_epoch()) - - self._osdmap_changed(old_osdmap, self._latest_osdmap) - elif notify_type == "pg_summary": - # if there are no events we will skip this here to avoid - # expensive get calls - if len(self._events) == 0: - return - - global_event = False - data = self.get("pg_stats") - ready = self.get("pg_ready") - for ev_id in list(self._events): - ev = self._events[ev_id] - # Check for types of events - # we have to update - if isinstance(ev, PgRecoveryEvent): - ev.pg_update(data, ready, self.log) - self.maybe_complete(ev) - elif isinstance(ev, GlobalRecoveryEvent): - global_event = True - ev.global_event_update_progress(data, self.log) - self.maybe_complete(ev) - - if not global_event: - # If there is no global event - # we create one - self._pg_state_changed(data) + + global_event = False + data = self.get("pg_stats") + ready = self.get("pg_ready") + for ev_id in list(self._events): + ev = self._events[ev_id] + # Check for types of events + # we have to update + if isinstance(ev, PgRecoveryEvent): + ev.pg_update(data, ready, self.log) + self.maybe_complete(ev) + elif isinstance(ev, GlobalRecoveryEvent): + global_event = True + ev.global_event_update_progress(data, self.log) + self.maybe_complete(ev) + + if not global_event: + # If there is no global event + # we create one + self._pg_state_changed(data) def maybe_complete(self, event): # type: (Event) -> None @@ -736,7 +723,11 @@ class Module(MgrModule): self._save() self._dirty = False - self._shutdown.wait(timeout=self.persist_interval) + if self.enabled: + self._process_osdmap() + self._process_pg_summary() + + self._shutdown.wait(timeout=self.sleep_interval) self._shutdown.wait() -- 2.39.5