From 6cc1b8142f87e8d4f05fb96540dcf590abd10c50 Mon Sep 17 00:00:00 2001
From: Cory Snyder
Date: Thu, 29 Jul 2021 16:08:19 -0400
Subject: [PATCH] mgr/cephadm: fix exceptions causing stuck progress indicators

Added a try block to ensure that progress of applying a service spec is
updated as failed in the case of exceptions.

Fixes: https://tracker.ceph.com/issues/51961
Signed-off-by: Cory Snyder
(cherry picked from commit f247a7bed24756ba8a6f02579c98813931497421)
---
 src/pybind/mgr/cephadm/serve.py | 224 ++++++++++++++++----------
 1 file changed, 114 insertions(+), 110 deletions(-)

diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index 12bda65e59071..420b560fc1321 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -736,124 +736,128 @@ class CephadmServe:
         self.log.debug('Hosts that will receive new daemons: %s' % slots_to_add)
         self.log.debug('Daemons that will be removed: %s' % daemons_to_remove)
 
-        # assign names
-        for i in range(len(slots_to_add)):
-            slot = slots_to_add[i]
-            slot = slot.assign_name(self.mgr.get_unique_name(
-                slot.daemon_type,
-                slot.hostname,
-                daemons,
-                prefix=spec.service_id,
-                forcename=slot.name,
-                rank=slot.rank,
-                rank_generation=slot.rank_generation,
-            ))
-            slots_to_add[i] = slot
-            if rank_map is not None:
-                assert slot.rank is not None
-                assert slot.rank_generation is not None
-                assert rank_map[slot.rank][slot.rank_generation] is None
-                rank_map[slot.rank][slot.rank_generation] = slot.name
-
-        if rank_map:
-            # record the rank_map before we make changes so that if we fail the
-            # next mgr will clean up.
-            self.mgr.spec_store.save_rank_map(spec.service_name(), rank_map)
-
-            # remove daemons now, since we are going to fence them anyway
-            for d in daemons_to_remove:
-                assert d.hostname is not None
-                self._remove_daemon(d.name(), d.hostname)
-            daemons_to_remove = []
-
-            # fence them
-            svc.fence_old_ranks(spec, rank_map, len(all_slots))
-
-        # create daemons
-        for slot in slots_to_add:
-            # first remove daemon on conflicting port?
-            if slot.ports:
+        try:
+            # assign names
+            for i in range(len(slots_to_add)):
+                slot = slots_to_add[i]
+                slot = slot.assign_name(self.mgr.get_unique_name(
+                    slot.daemon_type,
+                    slot.hostname,
+                    daemons,
+                    prefix=spec.service_id,
+                    forcename=slot.name,
+                    rank=slot.rank,
+                    rank_generation=slot.rank_generation,
+                ))
+                slots_to_add[i] = slot
+                if rank_map is not None:
+                    assert slot.rank is not None
+                    assert slot.rank_generation is not None
+                    assert rank_map[slot.rank][slot.rank_generation] is None
+                    rank_map[slot.rank][slot.rank_generation] = slot.name
+
+            if rank_map:
+                # record the rank_map before we make changes so that if we fail the
+                # next mgr will clean up.
+                self.mgr.spec_store.save_rank_map(spec.service_name(), rank_map)
+
+                # remove daemons now, since we are going to fence them anyway
                 for d in daemons_to_remove:
-                    if d.hostname != slot.hostname:
-                        continue
-                    if not (set(d.ports or []) & set(slot.ports)):
-                        continue
-                    if d.ip and slot.ip and d.ip != slot.ip:
-                        continue
-                    self.log.info(
-                        f'Removing {d.name()} before deploying to {slot} to avoid a port conflict'
-                    )
-                    # NOTE: we don't check ok-to-stop here to avoid starvation if
-                    # there is only 1 gateway.
+                    assert d.hostname is not None
                     self._remove_daemon(d.name(), d.hostname)
-                    daemons_to_remove.remove(d)
+                daemons_to_remove = []
+
+                # fence them
+                svc.fence_old_ranks(spec, rank_map, len(all_slots))
+
+            # create daemons
+            for slot in slots_to_add:
+                # first remove daemon on conflicting port?
+                if slot.ports:
+                    for d in daemons_to_remove:
+                        if d.hostname != slot.hostname:
+                            continue
+                        if not (set(d.ports or []) & set(slot.ports)):
+                            continue
+                        if d.ip and slot.ip and d.ip != slot.ip:
+                            continue
+                        self.log.info(
+                            f'Removing {d.name()} before deploying to {slot} to avoid a port conflict'
+                        )
+                        # NOTE: we don't check ok-to-stop here to avoid starvation if
+                        # there is only 1 gateway.
+                        self._remove_daemon(d.name(), d.hostname)
+                        daemons_to_remove.remove(d)
+                        progress_done += 1
+                        break
+
+                # deploy new daemon
+                daemon_id = slot.name
+                if not did_config:
+                    svc.config(spec, daemon_id)
+                    did_config = True
+
+                daemon_spec = svc.make_daemon_spec(
+                    slot.hostname, daemon_id, slot.network, spec,
+                    daemon_type=slot.daemon_type,
+                    ports=slot.ports,
+                    ip=slot.ip,
+                    rank=slot.rank,
+                    rank_generation=slot.rank_generation,
+                )
+                self.log.debug('Placing %s.%s on host %s' % (
+                    slot.daemon_type, daemon_id, slot.hostname))
+
+                try:
+                    daemon_spec = svc.prepare_create(daemon_spec)
+                    self._create_daemon(daemon_spec)
+                    r = True
                     progress_done += 1
-                    break
+                    update_progress()
+                except (RuntimeError, OrchestratorError) as e:
+                    msg = (f"Failed while placing {slot.daemon_type}.{daemon_id} "
+                           f"on {slot.hostname}: {e}")
+                    self.mgr.events.for_service(spec, 'ERROR', msg)
+                    self.mgr.log.error(msg)
+                    # only return "no change" if no one else has already succeeded.
+                    # later successes will also change to True
+                    if r is None:
+                        r = False
+                    progress_done += 1
+                    update_progress()
+                    continue
 
-            # deploy new daemon
-            daemon_id = slot.name
-            if not did_config:
-                svc.config(spec, daemon_id)
-                did_config = True
-
-            daemon_spec = svc.make_daemon_spec(
-                slot.hostname, daemon_id, slot.network, spec,
-                daemon_type=slot.daemon_type,
-                ports=slot.ports,
-                ip=slot.ip,
-                rank=slot.rank,
-                rank_generation=slot.rank_generation,
-            )
-            self.log.debug('Placing %s.%s on host %s' % (
-                slot.daemon_type, daemon_id, slot.hostname))
-
-            try:
-                daemon_spec = svc.prepare_create(daemon_spec)
-                self._create_daemon(daemon_spec)
+                # add to daemon list so next name(s) will also be unique
+                sd = orchestrator.DaemonDescription(
+                    hostname=slot.hostname,
+                    daemon_type=slot.daemon_type,
+                    daemon_id=daemon_id,
+                )
+                daemons.append(sd)
+
+            # remove any?
+            def _ok_to_stop(remove_daemons: List[orchestrator.DaemonDescription]) -> bool:
+                daemon_ids = [d.daemon_id for d in remove_daemons]
+                assert None not in daemon_ids
+                # setting force flag retains previous behavior
+                r = svc.ok_to_stop(cast(List[str], daemon_ids), force=True)
+                return not r.retval
+
+            while daemons_to_remove and not _ok_to_stop(daemons_to_remove):
+                # let's find a subset that is ok-to-stop
+                daemons_to_remove.pop()
+            for d in daemons_to_remove:
                 r = True
+                assert d.hostname is not None
+                self._remove_daemon(d.name(), d.hostname)
+
                 progress_done += 1
                 update_progress()
-            except (RuntimeError, OrchestratorError) as e:
-                msg = (f"Failed while placing {slot.daemon_type}.{daemon_id} "
-                       f"on {slot.hostname}: {e}")
-                self.mgr.events.for_service(spec, 'ERROR', msg)
-                self.mgr.log.error(msg)
-                # only return "no change" if no one else has already succeeded.
-                # later successes will also change to True
-                if r is None:
-                    r = False
-                progress_done += 1
-                update_progress()
-                continue
-
-            # add to daemon list so next name(s) will also be unique
-            sd = orchestrator.DaemonDescription(
-                hostname=slot.hostname,
-                daemon_type=slot.daemon_type,
-                daemon_id=daemon_id,
-            )
-            daemons.append(sd)
-
-        # remove any?
-        def _ok_to_stop(remove_daemons: List[orchestrator.DaemonDescription]) -> bool:
-            daemon_ids = [d.daemon_id for d in remove_daemons]
-            assert None not in daemon_ids
-            # setting force flag retains previous behavior
-            r = svc.ok_to_stop(cast(List[str], daemon_ids), force=True)
-            return not r.retval
-
-        while daemons_to_remove and not _ok_to_stop(daemons_to_remove):
-            # let's find a subset that is ok-to-stop
-            daemons_to_remove.pop()
-        for d in daemons_to_remove:
-            r = True
-            assert d.hostname is not None
-            self._remove_daemon(d.name(), d.hostname)
-
-            progress_done += 1
-            update_progress()
 
-        self.mgr.remote('progress', 'complete', progress_id)
+            self.mgr.remote('progress', 'complete', progress_id)
+        except Exception as e:
+            self.mgr.remote('progress', 'fail', progress_id, str(e))
+            raise
 
         if r is None:
             r = False
-- 
2.39.5
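
Note (not part of the original patch): the fix follows a general pattern — open a
progress event, do the work inside a try block, and mark the event failed before
re-raising so the indicator can never get stuck at its last update. Below is a
minimal, self-contained Python sketch of that pattern; the ProgressTracker class
and apply_spec function are illustrative stand-ins, not cephadm APIs.

```python
import uuid
from typing import Callable, Dict, List


class ProgressTracker:
    """Illustrative stand-in for the mgr 'progress' module used by cephadm."""

    def __init__(self) -> None:
        self.events: Dict[str, str] = {}

    def update(self, ev_id: str, msg: str) -> None:
        self.events[ev_id] = f"running: {msg}"

    def complete(self, ev_id: str) -> None:
        self.events[ev_id] = "complete"

    def fail(self, ev_id: str, msg: str) -> None:
        # Marking the event failed is what keeps it from lingering forever.
        self.events[ev_id] = f"failed: {msg}"


def apply_spec(progress: ProgressTracker, tasks: List[Callable[[], None]]) -> None:
    ev_id = str(uuid.uuid4())
    progress.update(ev_id, "applying service spec")
    try:
        for task in tasks:
            task()  # any step may raise, e.g. an orchestrator error
        progress.complete(ev_id)
    except Exception as e:
        # Without this branch an exception would leave the event stuck at its
        # last update(), which is the symptom the patch fixes.
        progress.fail(ev_id, str(e))
        raise
```

With this structure a failing task leaves the event in a "failed: ..." state rather
than "running: ...", mirroring what self.mgr.remote('progress', 'fail', progress_id,
str(e)) does in the patched serve.py.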