From: Dan Mick Date: Thu, 6 Aug 2020 02:00:57 +0000 (+0000) Subject: mgr/cephadm: continue trying all hosts for service deployment X-Git-Tag: v15.2.8~14^2~39 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=56945ca0cda2fd0e6cbb07db619395bcd468713e;p=ceph.git mgr/cephadm: continue trying all hosts for service deployment One failure should not stop the attempt to start all instances. Fixes: https://tracker.ceph.com/issues/46665 Signed-off-by: Dan Mick (cherry picked from commit dd7bd4807a1668e1fb91581c69bb549ae62611db) --- diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 3b2266626808..9f9e0540276f 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -2194,6 +2194,8 @@ To check that the host is reachable: if daemon_type == 'osd': self.osd_service.create_from_spec(cast(DriveGroupSpec, spec)) # TODO: return True would result in a busy loop + # can't know if daemon count changed; create_from_spec doesn't + # return a solid indication return False daemons = self.cache.get_daemons_by_service(service_name) @@ -2227,7 +2229,7 @@ To check that the host is reachable: hosts: List[HostPlacementSpec] = ha.place() self.log.debug('Usable hosts: %s' % hosts) - r = False + r = None # sanity check if daemon_type in ['mon', 'mgr'] and len(hosts) < 1: @@ -2261,9 +2263,19 @@ To check that the host is reachable: self.log.debug('Placing %s.%s on host %s' % ( daemon_type, daemon_id, host)) - daemon_spec = self.cephadm_services[daemon_type].prepare_create(daemon_spec) - - self._create_daemon(daemon_spec) + try: + daemon_spec = self.cephadm_services[daemon_type].prepare_create(daemon_spec) + self._create_daemon(daemon_spec) + r = True + except (RuntimeError, OrchestratorError) as e: + self.events.for_service(spec, 'ERROR', + f"Failed while placing {daemon_type}.{daemon_id}" + "on {host}: {e}") + # only return "no change" if no one else has already succeeded. + # later successes will also change to True + if r is None: + r = False + continue # add to daemon list so next name(s) will also be unique sd = orchestrator.DaemonDescription( @@ -2272,7 +2284,6 @@ To check that the host is reachable: daemon_id=daemon_id, ) daemons.append(sd) - r = True # remove any? def _ok_to_stop(remove_daemon_hosts: Set[orchestrator.DaemonDescription]) -> bool: @@ -2284,11 +2295,13 @@ To check that the host is reachable: # let's find a subset that is ok-to-stop remove_daemon_hosts.pop() for d in remove_daemon_hosts: + r = True # NOTE: we are passing the 'force' flag here, which means # we can delete a mon instances data. self._remove_daemon(d.name(), d.hostname) - r = True + if r is None: + r = False return r def _apply_all_services(self):