git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: continue trying all hosts for service deployment 36622/head
author: Dan Mick <dmick@redhat.com>
Thu, 6 Aug 2020 02:00:57 +0000 (02:00 +0000)
committer: Dan Mick <dmick@redhat.com>
Wed, 23 Sep 2020 03:35:44 +0000 (03:35 +0000)
One failure should not stop the attempt to start all instances.

Fixes: https://tracker.ceph.com/issues/46665
Signed-off-by: Dan Mick <dmick@redhat.com>
src/pybind/mgr/cephadm/module.py

index a36a2206500619bae4f409f729324cc698beb1a1..6bcf546f5031eacb53dc9ce8732b7d71950ef694 100644 (file)
@@ -2185,6 +2185,8 @@ To check that the host is reachable:
         if daemon_type == 'osd':
             self.osd_service.create_from_spec(cast(DriveGroupSpec, spec))
             # TODO: return True would result in a busy loop
+            # can't know if daemon count changed; create_from_spec doesn't
+            # return a solid indication
             return False
 
         daemons = self.cache.get_daemons_by_service(service_name)
@@ -2218,7 +2220,7 @@ To check that the host is reachable:
         hosts: List[HostPlacementSpec] = ha.place()
         self.log.debug('Usable hosts: %s' % hosts)
 
-        r = False
+        r = None
 
         # sanity check
         if daemon_type in ['mon', 'mgr'] and len(hosts) < 1:
@@ -2252,9 +2254,19 @@ To check that the host is reachable:
             self.log.debug('Placing %s.%s on host %s' % (
                 daemon_type, daemon_id, host))
 
-            daemon_spec = self.cephadm_services[daemon_type].prepare_create(daemon_spec)
-
-            self._create_daemon(daemon_spec)
+            try:
+                daemon_spec = self.cephadm_services[daemon_type].prepare_create(daemon_spec)
+                self._create_daemon(daemon_spec)
+                r = True
+            except (RuntimeError, OrchestratorError) as e:
+                self.events.for_service(spec, 'ERROR',
+                    f"Failed while placing {daemon_type}.{daemon_id}"
+                    "on {host}: {e}")
+                # only return "no change" if no one else has already succeeded.
+                # later successes will also change to True
+                if r is None:
+                    r = False
+                continue
 
             # add to daemon list so next name(s) will also be unique
             sd = orchestrator.DaemonDescription(
@@ -2263,7 +2275,6 @@ To check that the host is reachable:
                 daemon_id=daemon_id,
             )
             daemons.append(sd)
-            r = True
 
         # remove any?
         def _ok_to_stop(remove_daemon_hosts: Set[orchestrator.DaemonDescription]) -> bool:
@@ -2275,11 +2286,13 @@ To check that the host is reachable:
             # let's find a subset that is ok-to-stop
             remove_daemon_hosts.pop()
         for d in remove_daemon_hosts:
+            r = True
             # NOTE: we are passing the 'force' flag here, which means
             # we can delete a mon instances data.
             self._remove_daemon(d.name(), d.hostname)
-            r = True
 
+        if r is None:
+            r = False
         return r
 
     def _apply_all_services(self):