]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
pybind/mgr/progress: introduce 5 second sleep interval
authorKamoltat <ksirivad@redhat.com>
Tue, 13 Jul 2021 19:06:44 +0000 (19:06 +0000)
committerKamoltat <ksirivad@redhat.com>
Thu, 30 Sep 2021 13:33:03 +0000 (13:33 +0000)
Current progress module only checks pg stats
and osdmap when it is notified by the cluster.
However, this is expensive in large cluster
with many pools and osds. we
change it to only check both pg stats and osdmap
every 5 seconds.

in the function _osd_in_out() we now calculate
`is_relocated` by: old_osds != new_osds such that
it does not matter if the difference between osds
are positive or negative.

Signed-off-by: Kamoltat <ksirivad@redhat.com>
(cherry picked from commit 4504749b81f9cb11d92d5f280565aff3f243adf3)

src/pybind/mgr/progress/module.py

index 3aab7514f3a2558ecd31592cd09627844a00b0e7..1f6162e2c8bdd5dbb90981b4b3756f24e7aa1fbb 100644 (file)
@@ -443,10 +443,10 @@ class Module(MgrModule):
             runtime=True
         ),
         Option(
-            'persist_interval',
+            'sleep_interval',
             default=5,
             type='secs',
-            desc='how frequently to persist completed events',
+            desc='how long the module is going to sleep',
             runtime=True
         ),
         Option(
@@ -477,7 +477,7 @@ class Module(MgrModule):
         # only for mypy
         if TYPE_CHECKING:
             self.max_completed_events = 0
-            self.persist_interval = 0
+            self.sleep_interval = 0
             self.enabled = True
 
     def config_notify(self):
@@ -492,7 +492,6 @@ class Module(MgrModule):
         # A function that will create or complete an event when an
         # OSD is marked in or out according to the affected PGs
         affected_pgs = []
-        unmoved_pgs = []
         for pool in old_dump['pools']:
             pool_id = pool['pool']  # type: str
             for ps in range(0, pool['pg_num']):
@@ -521,11 +520,9 @@ class Module(MgrModule):
                 # Has this OSD been assigned a new location?
                 # (it might not be if there is no suitable place to move
                 #  after an OSD is marked in/out)
-                if marked == "in":
-                    is_relocated = len(old_osds - new_osds) > 0
-                else:
-                    is_relocated = len(new_osds - old_osds) > 0
-
+                
+                is_relocated = old_osds != new_osds
+                
                 self.log.debug(
                     "new_up_acting: {0}".format(json.dumps(new_up_acting,
                                                            indent=4,
@@ -534,15 +531,9 @@ class Module(MgrModule):
                 if was_on_out_or_in_osd and is_relocated:
                     # This PG is now in motion, track its progress
                     affected_pgs.append(PgId(pool_id, ps))
-                elif not is_relocated:
-                    # This PG didn't get a new location, we'll log it
-                    unmoved_pgs.append(PgId(pool_id, ps))
 
         # In the case that we ignored some PGs, log the reason why (we may
         # not end up creating a progress event)
-        if len(unmoved_pgs):
-            self.log.warning("{0} PGs were on osd.{1}, but didn't get new locations".format(
-                len(unmoved_pgs), osd_id))
 
         self.log.warning("{0} PGs affected by osd.{1} being marked {2}".format(
             len(affected_pgs), osd_id, marked))
@@ -627,45 +618,41 @@ class Module(MgrModule):
             ev.global_event_update_progress(self.get('pg_stats'), self.log)
             self._events[ev.id] = ev
 
-    def notify(self, notify_type, notify_data):
-        self._ready.wait()
-        if not self.enabled:
+    def _process_osdmap(self):
+        old_osdmap = self._latest_osdmap
+        self._latest_osdmap = self.get_osdmap()
+        assert old_osdmap
+        assert self._latest_osdmap
+        self.log.info(("Processing OSDMap change %d..%d"),
+            old_osdmap.get_epoch(), self._latest_osdmap.get_epoch())
+
+        self._osdmap_changed(old_osdmap, self._latest_osdmap)
+
+    def _process_pg_summary(self):
+        # if there are no events we will skip this here to avoid
+        # expensive get calls
+        if len(self._events) == 0:
             return
-        if notify_type == "osd_map":
-            old_osdmap = self._latest_osdmap
-            self._latest_osdmap = self.get_osdmap()
-            assert old_osdmap
-            assert self._latest_osdmap
-
-            self.log.info(("Processing OSDMap change %d..%d"),
-                old_osdmap.get_epoch(), self._latest_osdmap.get_epoch())
-
-            self._osdmap_changed(old_osdmap, self._latest_osdmap)
-        elif notify_type == "pg_summary":
-            # if there are no events we will skip this here to avoid
-            # expensive get calls
-            if len(self._events) == 0:
-                return
-
-            global_event = False
-            data = self.get("pg_stats")
-            ready = self.get("pg_ready")
-            for ev_id in list(self._events):
-                ev = self._events[ev_id]
-                # Check for types of events
-                # we have to update
-                if isinstance(ev, PgRecoveryEvent):
-                    ev.pg_update(data, ready, self.log)
-                    self.maybe_complete(ev)
-                elif isinstance(ev, GlobalRecoveryEvent):
-                    global_event = True
-                    ev.global_event_update_progress(data, self.log)
-                    self.maybe_complete(ev)
-
-            if not global_event:
-                # If there is no global event
-                # we create one
-                self._pg_state_changed(data)
+
+        global_event = False
+        data = self.get("pg_stats")
+        ready = self.get("pg_ready")
+        for ev_id in list(self._events):
+            ev = self._events[ev_id]
+            # Check for types of events
+            # we have to update
+            if isinstance(ev, PgRecoveryEvent):
+                ev.pg_update(data, ready, self.log)
+                self.maybe_complete(ev)
+            elif isinstance(ev, GlobalRecoveryEvent):
+                global_event = True
+                ev.global_event_update_progress(data, self.log)
+                self.maybe_complete(ev)
+
+        if not global_event:
+            # If there is no global event
+            # we create one
+            self._pg_state_changed(data)
 
     def maybe_complete(self, event):
         # type: (Event) -> None
@@ -736,7 +723,11 @@ class Module(MgrModule):
                 self._save()
                 self._dirty = False
 
-            self._shutdown.wait(timeout=self.persist_interval)
+            if self.enabled:
+                self._process_osdmap()
+                self._process_pg_summary()
+
+            self._shutdown.wait(timeout=self.sleep_interval)
 
         self._shutdown.wait()