git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm/upgrade: restart multiple osds at once
author Sage Weil <sage@newdream.net>
Fri, 26 Feb 2021 16:53:54 +0000 (11:53 -0500)
committer Sage Weil <sage@newdream.net>
Tue, 16 Mar 2021 12:56:17 +0000 (07:56 -0500)
Restart multiple osds in a single upgrade pass, when possible.

Signed-off-by: Sage Weil <sage@newdream.net>
(cherry picked from commit b3d0420e60bcaede1d253bd551f201e756cc3d9a)

src/pybind/mgr/cephadm/upgrade.py

index 189788cb28e49efb9c66bb5f744bd8906ae03f99..ed4bbb6c9a6219eeaa7267c564c6d48b5556b118 100644 (file)
@@ -225,7 +225,7 @@ class CephadmUpgrade:
 
             # setting force flag to retain old functionality.
             r = self.mgr.cephadm_services[daemon_type_to_service(s.daemon_type)].ok_to_stop([
-                s.daemon_id], force=True)
+                s.daemon_id], known=known, force=True)
 
             if not r.retval:
                 logger.info(f'Upgrade: {r.stdout}')
@@ -453,11 +453,42 @@ class CephadmUpgrade:
             ):
                 return
 
+            to_upgrade = []
+            known_ok_to_stop: List[str] = []
             for d in need_upgrade:
                 assert d.daemon_type is not None
                 assert d.daemon_id is not None
                 assert d.hostname is not None
 
+                if not d.container_image_id:
+                    if d.container_image_name == target_image:
+                        logger.debug(
+                            'daemon %s has unknown container_image_id but has correct image name' % (d.name()))
+                        continue
+
+                if known_ok_to_stop:
+                    if d.name() in known_ok_to_stop:
+                        logger.info(f'Upgrade: {d.name()} is also safe to restart')
+                        to_upgrade.append(d)
+                    continue
+
+                if not self._wait_for_ok_to_stop(d, known_ok_to_stop):
+                    return
+
+                to_upgrade.append(d)
+
+                # if we don't have a list of others to consider, stop now
+                if not known_ok_to_stop:
+                    break
+
+            num = 1
+            for d in to_upgrade:
+                assert d.daemon_type is not None
+                assert d.daemon_id is not None
+                assert d.hostname is not None
+
+                self._update_upgrade_progress(done / len(daemons))
+
                 # make sure host has latest container image
                 out, errs, code = CephadmServe(self.mgr)._run_cephadm(
                     d.hostname, '', 'inspect-image', [],
@@ -486,17 +517,12 @@ class CephadmUpgrade:
                         self._save_upgrade_state()
                         return
 
-                self._update_upgrade_progress(done / len(daemons))
-
-                if not d.container_image_id:
-                    if d.container_image_name == target_image:
-                        logger.debug(
-                            'daemon %s has unknown container_image_id but has correct image name' % (d.name()))
-                        continue
-                if not self._wait_for_ok_to_stop(d):
-                    return
-                logger.info('Upgrade: Updating %s.%s' %
-                            (d.daemon_type, d.daemon_id))
+                if len(to_upgrade) > 1:
+                    logger.info('Upgrade: Updating %s.%s (%d/%d)' %
+                                (d.daemon_type, d.daemon_id, num, len(to_upgrade)))
+                else:
+                    logger.info('Upgrade: Updating %s.%s' %
+                                (d.daemon_type, d.daemon_id))
                 try:
                     self.mgr._daemon_action(
                         d.daemon_type,
@@ -514,6 +540,9 @@ class CephadmUpgrade:
                             f'Upgrade daemon: {d.name()}: {e}'
                         ],
                     })
+                    return
+                num += 1
+            if to_upgrade:
                 return
 
             # complete mon upgrade?