]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: continue trying all hosts for service deployment
authorDan Mick <dmick@redhat.com>
Thu, 6 Aug 2020 02:00:57 +0000 (02:00 +0000)
committerSebastian Wagner <sebastian.wagner@suse.com>
Wed, 18 Nov 2020 10:52:17 +0000 (11:52 +0100)
One failure should not stop the attempt to start all instances.

Fixes: https://tracker.ceph.com/issues/46665
Signed-off-by: Dan Mick <dmick@redhat.com>
(cherry picked from commit dd7bd4807a1668e1fb91581c69bb549ae62611db)

src/pybind/mgr/cephadm/module.py

index 3b2266626808336343aa4303a4f5efaeb63dfc81..9f9e0540276f8b68ec052eba8a72615ce8621146 100644 (file)
@@ -2194,6 +2194,8 @@ To check that the host is reachable:
         if daemon_type == 'osd':
             self.osd_service.create_from_spec(cast(DriveGroupSpec, spec))
             # TODO: return True would result in a busy loop
+            # can't know if daemon count changed; create_from_spec doesn't
+            # return a solid indication
             return False
 
         daemons = self.cache.get_daemons_by_service(service_name)
@@ -2227,7 +2229,7 @@ To check that the host is reachable:
         hosts: List[HostPlacementSpec] = ha.place()
         self.log.debug('Usable hosts: %s' % hosts)
 
-        r = False
+        r = None
 
         # sanity check
         if daemon_type in ['mon', 'mgr'] and len(hosts) < 1:
@@ -2261,9 +2263,19 @@ To check that the host is reachable:
             self.log.debug('Placing %s.%s on host %s' % (
                 daemon_type, daemon_id, host))
 
-            daemon_spec = self.cephadm_services[daemon_type].prepare_create(daemon_spec)
-
-            self._create_daemon(daemon_spec)
+            try:
+                daemon_spec = self.cephadm_services[daemon_type].prepare_create(daemon_spec)
+                self._create_daemon(daemon_spec)
+                r = True
+            except (RuntimeError, OrchestratorError) as e:
+                self.events.for_service(spec, 'ERROR',
+                    f"Failed while placing {daemon_type}.{daemon_id}"
+                    "on {host}: {e}")
+                # only return "no change" if no one else has already succeeded.
+                # later successes will also change to True
+                if r is None:
+                    r = False
+                continue
 
             # add to daemon list so next name(s) will also be unique
             sd = orchestrator.DaemonDescription(
@@ -2272,7 +2284,6 @@ To check that the host is reachable:
                 daemon_id=daemon_id,
             )
             daemons.append(sd)
-            r = True
 
         # remove any?
         def _ok_to_stop(remove_daemon_hosts: Set[orchestrator.DaemonDescription]) -> bool:
@@ -2284,11 +2295,13 @@ To check that the host is reachable:
             # let's find a subset that is ok-to-stop
             remove_daemon_hosts.pop()
         for d in remove_daemon_hosts:
+            r = True
             # NOTE: we are passing the 'force' flag here, which means
             # we can delete a mon instances data.
             self._remove_daemon(d.name(), d.hostname)
-            r = True
 
+        if r is None:
+            r = False
         return r
 
     def _apply_all_services(self):