git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
pybind/mgr/progress: enforced try and except on accessing event dictionary 44671/head
author Kamoltat <ksirivad@redhat.com>
Wed, 12 Jan 2022 02:41:01 +0000 (02:41 +0000)
committer Kamoltat <ksirivad@redhat.com>
Wed, 19 Jan 2022 15:34:06 +0000 (15:34 +0000)
There is a race condition in which an event
can be deleted while the progress module is
iterating over the ``events`` dictionary.
Without a ``try``/``except`` around the lookup,
the resulting ``KeyError`` goes unhandled and
crashes the module.

This commit will enforce ``try and except``
on every part of the code where we are accessing
the ``events`` dictionary.

Fixes: https://tracker.ceph.com/issues/53803
Signed-off-by: Kamoltat <ksirivad@redhat.com>
(cherry picked from commit b70d4a9caae0eb859e10b68f93573d507625d267)

src/pybind/mgr/progress/module.py

index 5f9aa86f647924a5be6c38ed9ab738c84e77419d..422aba962a1c0da4c63748f081dbe8e01b03d8e6 100644 (file)
@@ -539,12 +539,15 @@ class Module(MgrModule):
         # previous recovery event for that osd
         if marked == "in":
             for ev_id in list(self._events):
-                ev = self._events[ev_id]
-                if isinstance(ev, PgRecoveryEvent) and osd_id in ev.which_osds:
-                    self.log.info("osd.{0} came back in, cancelling event".format(
-                        osd_id
-                    ))
-                    self._complete(ev)
+                try:
+                    ev = self._events[ev_id]
+                    if isinstance(ev, PgRecoveryEvent) and osd_id in ev.which_osds:
+                        self.log.info("osd.{0} came back in, cancelling event".format(
+                            osd_id
+                        ))
+                        self._complete(ev)
+                except KeyError:
+                    self.log.warning("_osd_in_out: ev {0} does not exist".format(ev_id))
 
         if len(affected_pgs) > 0:
             r_ev = PgRecoveryEvent(
@@ -625,16 +628,20 @@ class Module(MgrModule):
         global_event = False
         data = self.get("pg_progress")
         for ev_id in list(self._events):
-            ev = self._events[ev_id]
-            # Check for types of events
-            # we have to update
-            if isinstance(ev, PgRecoveryEvent):
-                ev.pg_update(data, self.log)
-                self.maybe_complete(ev)
-            elif isinstance(ev, GlobalRecoveryEvent):
-                global_event = True
-                ev.global_event_update_progress(self.log)
-                self.maybe_complete(ev)
+            try:
+                ev = self._events[ev_id]
+                # Check for types of events
+                # we have to update
+                if isinstance(ev, PgRecoveryEvent):
+                    ev.pg_update(data, self.log)
+                    self.maybe_complete(ev)
+                elif isinstance(ev, GlobalRecoveryEvent):
+                    global_event = True
+                    ev.global_event_update_progress(self.log)
+                    self.maybe_complete(ev)
+            except KeyError:
+                self.log.warning("_process_pg_summary: ev {0} does not exist".format(ev_id))
+                continue
 
         if not global_event:
             # If there is no global event
@@ -736,6 +743,7 @@ class Module(MgrModule):
             ev = self._events[ev_id]
             assert isinstance(ev, RemoteEvent)
         except KeyError:
+            # if key doesn't exist we create an event
             ev = RemoteEvent(ev_id, ev_msg, refs, add_to_ceph_s)
             self._events[ev_id] = ev
             self.log.info("update: starting ev {0} ({1})".format(