mgr/cephadm: fix exceptions causing stuck progress indicators
author    Cory Snyder <csnyder@iland.com>
          Thu, 29 Jul 2021 20:08:19 +0000 (16:08 -0400)
committer Sebastian Wagner <sewagner@redhat.com>
          Tue, 10 Aug 2021 14:32:16 +0000 (16:32 +0200)
Add a try block to ensure that the progress item for applying a service spec is marked as failed when an exception is raised, rather than remaining stuck.

Fixes: https://tracker.ceph.com/issues/51961
Signed-off-by: Cory Snyder <csnyder@iland.com>
(cherry picked from commit f247a7bed24756ba8a6f02579c98813931497421)
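
The fix follows a simple pattern: wrap the whole apply sequence in a try
block, mark the mgr progress event complete only after every step succeeds,
and on any exception mark the event failed before re-raising. A minimal
sketch of that pattern (the apply_with_progress helper is hypothetical;
the 'complete'/'fail' progress remote calls are the ones used in the diff
below):

    def apply_with_progress(mgr, progress_id, tasks):
        # Sketch of the commit's pattern, not the actual cephadm code:
        # run each step, then resolve the mgr progress event either way.
        try:
            for task in tasks:
                task()  # any step may raise, e.g. while creating a daemon
            # reached only if every step succeeded
            mgr.remote('progress', 'complete', progress_id)
        except Exception as e:
            # previously an exception here left the progress indicator
            # stuck; explicitly fail the event, then re-raise so the
            # caller still sees the error
            mgr.remote('progress', 'fail', progress_id, str(e))
            raise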

src/pybind/mgr/cephadm/serve.py

index 12bda65e59071839683ea60e986852d8079efce5..420b560fc1321af7dbd16152f582ee9dc338d9e4 100644
@@ -736,124 +736,128 @@ class CephadmServe:
         self.log.debug('Hosts that will receive new daemons: %s' % slots_to_add)
         self.log.debug('Daemons that will be removed: %s' % daemons_to_remove)
 
-        # assign names
-        for i in range(len(slots_to_add)):
-            slot = slots_to_add[i]
-            slot = slot.assign_name(self.mgr.get_unique_name(
-                slot.daemon_type,
-                slot.hostname,
-                daemons,
-                prefix=spec.service_id,
-                forcename=slot.name,
-                rank=slot.rank,
-                rank_generation=slot.rank_generation,
-            ))
-            slots_to_add[i] = slot
-            if rank_map is not None:
-                assert slot.rank is not None
-                assert slot.rank_generation is not None
-                assert rank_map[slot.rank][slot.rank_generation] is None
-                rank_map[slot.rank][slot.rank_generation] = slot.name
-
-        if rank_map:
-            # record the rank_map before we make changes so that if we fail the
-            # next mgr will clean up.
-            self.mgr.spec_store.save_rank_map(spec.service_name(), rank_map)
-
-            # remove daemons now, since we are going to fence them anyway
-            for d in daemons_to_remove:
-                assert d.hostname is not None
-                self._remove_daemon(d.name(), d.hostname)
-            daemons_to_remove = []
-
-            # fence them
-            svc.fence_old_ranks(spec, rank_map, len(all_slots))
-
-        # create daemons
-        for slot in slots_to_add:
-            # first remove daemon on conflicting port?
-            if slot.ports:
+        try:
+            # assign names
+            for i in range(len(slots_to_add)):
+                slot = slots_to_add[i]
+                slot = slot.assign_name(self.mgr.get_unique_name(
+                    slot.daemon_type,
+                    slot.hostname,
+                    daemons,
+                    prefix=spec.service_id,
+                    forcename=slot.name,
+                    rank=slot.rank,
+                    rank_generation=slot.rank_generation,
+                ))
+                slots_to_add[i] = slot
+                if rank_map is not None:
+                    assert slot.rank is not None
+                    assert slot.rank_generation is not None
+                    assert rank_map[slot.rank][slot.rank_generation] is None
+                    rank_map[slot.rank][slot.rank_generation] = slot.name
+
+            if rank_map:
+                # record the rank_map before we make changes so that if we fail the
+                # next mgr will clean up.
+                self.mgr.spec_store.save_rank_map(spec.service_name(), rank_map)
+
+                # remove daemons now, since we are going to fence them anyway
                 for d in daemons_to_remove:
-                    if d.hostname != slot.hostname:
-                        continue
-                    if not (set(d.ports or []) & set(slot.ports)):
-                        continue
-                    if d.ip and slot.ip and d.ip != slot.ip:
-                        continue
-                    self.log.info(
-                        f'Removing {d.name()} before deploying to {slot} to avoid a port conflict'
-                    )
-                    # NOTE: we don't check ok-to-stop here to avoid starvation if
-                    # there is only 1 gateway.
+                    assert d.hostname is not None
                     self._remove_daemon(d.name(), d.hostname)
-                    daemons_to_remove.remove(d)
+                daemons_to_remove = []
+
+                # fence them
+                svc.fence_old_ranks(spec, rank_map, len(all_slots))
+
+            # create daemons
+            for slot in slots_to_add:
+                # first remove daemon on conflicting port?
+                if slot.ports:
+                    for d in daemons_to_remove:
+                        if d.hostname != slot.hostname:
+                            continue
+                        if not (set(d.ports or []) & set(slot.ports)):
+                            continue
+                        if d.ip and slot.ip and d.ip != slot.ip:
+                            continue
+                        self.log.info(
+                            f'Removing {d.name()} before deploying to {slot} to avoid a port conflict'
+                        )
+                        # NOTE: we don't check ok-to-stop here to avoid starvation if
+                        # there is only 1 gateway.
+                        self._remove_daemon(d.name(), d.hostname)
+                        daemons_to_remove.remove(d)
+                        progress_done += 1
+                        break
+
+                # deploy new daemon
+                daemon_id = slot.name
+                if not did_config:
+                    svc.config(spec, daemon_id)
+                    did_config = True
+
+                daemon_spec = svc.make_daemon_spec(
+                    slot.hostname, daemon_id, slot.network, spec,
+                    daemon_type=slot.daemon_type,
+                    ports=slot.ports,
+                    ip=slot.ip,
+                    rank=slot.rank,
+                    rank_generation=slot.rank_generation,
+                )
+                self.log.debug('Placing %s.%s on host %s' % (
+                    slot.daemon_type, daemon_id, slot.hostname))
+
+                try:
+                    daemon_spec = svc.prepare_create(daemon_spec)
+                    self._create_daemon(daemon_spec)
+                    r = True
                     progress_done += 1
-                    break
-
-            # deploy new daemon
-            daemon_id = slot.name
-            if not did_config:
-                svc.config(spec, daemon_id)
-                did_config = True
-
-            daemon_spec = svc.make_daemon_spec(
-                slot.hostname, daemon_id, slot.network, spec,
-                daemon_type=slot.daemon_type,
-                ports=slot.ports,
-                ip=slot.ip,
-                rank=slot.rank,
-                rank_generation=slot.rank_generation,
-            )
-            self.log.debug('Placing %s.%s on host %s' % (
-                slot.daemon_type, daemon_id, slot.hostname))
+                    update_progress()
+                except (RuntimeError, OrchestratorError) as e:
+                    msg = (f"Failed while placing {slot.daemon_type}.{daemon_id} "
+                           f"on {slot.hostname}: {e}")
+                    self.mgr.events.for_service(spec, 'ERROR', msg)
+                    self.mgr.log.error(msg)
+                    # only return "no change" if no one else has already succeeded.
+                    # later successes will also change to True
+                    if r is None:
+                        r = False
+                    progress_done += 1
+                    update_progress()
+                    continue
 
-            try:
-                daemon_spec = svc.prepare_create(daemon_spec)
-                self._create_daemon(daemon_spec)
+                # add to daemon list so next name(s) will also be unique
+                sd = orchestrator.DaemonDescription(
+                    hostname=slot.hostname,
+                    daemon_type=slot.daemon_type,
+                    daemon_id=daemon_id,
+                )
+                daemons.append(sd)
+
+            # remove any?
+            def _ok_to_stop(remove_daemons: List[orchestrator.DaemonDescription]) -> bool:
+                daemon_ids = [d.daemon_id for d in remove_daemons]
+                assert None not in daemon_ids
+                # setting force flag retains previous behavior
+                r = svc.ok_to_stop(cast(List[str], daemon_ids), force=True)
+                return not r.retval
+
+            while daemons_to_remove and not _ok_to_stop(daemons_to_remove):
+                # let's find a subset that is ok-to-stop
+                daemons_to_remove.pop()
+            for d in daemons_to_remove:
                 r = True
+                assert d.hostname is not None
+                self._remove_daemon(d.name(), d.hostname)
+
                 progress_done += 1
                 update_progress()
-            except (RuntimeError, OrchestratorError) as e:
-                msg = (f"Failed while placing {slot.daemon_type}.{daemon_id} "
-                       f"on {slot.hostname}: {e}")
-                self.mgr.events.for_service(spec, 'ERROR', msg)
-                self.mgr.log.error(msg)
-                # only return "no change" if no one else has already succeeded.
-                # later successes will also change to True
-                if r is None:
-                    r = False
-                progress_done += 1
-                update_progress()
-                continue
-
-            # add to daemon list so next name(s) will also be unique
-            sd = orchestrator.DaemonDescription(
-                hostname=slot.hostname,
-                daemon_type=slot.daemon_type,
-                daemon_id=daemon_id,
-            )
-            daemons.append(sd)
-
-        # remove any?
-        def _ok_to_stop(remove_daemons: List[orchestrator.DaemonDescription]) -> bool:
-            daemon_ids = [d.daemon_id for d in remove_daemons]
-            assert None not in daemon_ids
-            # setting force flag retains previous behavior
-            r = svc.ok_to_stop(cast(List[str], daemon_ids), force=True)
-            return not r.retval
-
-        while daemons_to_remove and not _ok_to_stop(daemons_to_remove):
-            # let's find a subset that is ok-to-stop
-            daemons_to_remove.pop()
-        for d in daemons_to_remove:
-            r = True
-            assert d.hostname is not None
-            self._remove_daemon(d.name(), d.hostname)
-
-            progress_done += 1
-            update_progress()
 
-        self.mgr.remote('progress', 'complete', progress_id)
+            self.mgr.remote('progress', 'complete', progress_id)
+        except Exception as e:
+            self.mgr.remote('progress', 'fail', progress_id, str(e))
+            raise
 
         if r is None:
             r = False