From: Adam King <adking@redhat.com>
Date: Thu, 2 Dec 2021 13:27:10 +0000 (-0500)
Subject: mgr/cephadm: speed up upgrade when using agent
X-Git-Tag: v17.1.0~136^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bf2c4408aeb9e2b92d495c09514b73e59e8e5187;p=ceph.git

mgr/cephadm: speed up upgrade when using agent

Returning from the upgrade procedure entirely whenever
any host's metadata is not up to date is unnecessarily
strict. Instead, we can skip only the daemons on hosts
whose metadata is not up to date, and then make sure the
metadata is up to date on all hosts before declaring the
upgrade of a daemon type complete.

Fixes: https://tracker.ceph.com/issues/53598

Signed-off-by: Adam King <adking@redhat.com>
---

diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py
index c1609e042be4..e537e5978b73 100644
--- a/src/pybind/mgr/cephadm/agent.py
+++ b/src/pybind/mgr/cephadm/agent.py
@@ -356,6 +356,21 @@ class CephadmAgentHelpers:
                 host, self.mgr.cache.agent_ports[host], {'counter': self.mgr.cache.agent_counter[host]}, self.mgr)
             message_thread.start()
 
+    def _request_ack_all_not_up_to_date(self) -> None:
+        """Call ``_request_agent_acks`` for hosts with out-of-date metadata.
+
+        A host is included only if its metadata is not up to date, it has an
+        entry in ``agent_ports``, and ``messaging_agent`` is false for it.
+        """
+        cache = self.mgr.cache
+        stale_hosts = {
+            h for h in cache.get_hosts()
+            if not cache.host_metadata_up_to_date(h)
+            and h in cache.agent_ports
+            and not cache.messaging_agent(h)
+        }
+        self.mgr.agent_helpers._request_agent_acks(stale_hosts)
+
     def _agent_down(self, host: str) -> bool:
         # if host is draining or drained (has _no_schedule label) there should not
         # be an agent deployed there and therefore we should return False
diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py
index 487d9c84d676..a075d8bfc5ab 100644
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -521,13 +521,6 @@ class CephadmUpgrade:
         target_digests = self.upgrade_state.target_digests
         target_version = self.upgrade_state.target_version
 
-        if self.mgr.use_agent and not self.mgr.cache.all_host_metadata_up_to_date():
-            # need to wait for metadata to come in
-            self.mgr.agent_helpers._request_agent_acks(
-                set([h for h in self.mgr.cache.get_hosts() if
-                     (not self.mgr.cache.host_metadata_up_to_date(h) and h in self.mgr.cache.agent_ports and not self.mgr.cache.messaging_agent(h))]))
-            return
-
         first = False
         if not target_id or not target_version or not target_digests:
             # need to learn the container hash
@@ -612,6 +605,9 @@ class CephadmUpgrade:
                     continue
                 assert d.daemon_type is not None
                 assert d.daemon_id is not None
+                assert d.hostname is not None
+                if self.mgr.use_agent and not self.mgr.cache.host_metadata_up_to_date(d.hostname):
+                    continue
                 correct_digest = False
                 if (any(d in target_digests for d in (d.container_image_digests or []))
                         or d.daemon_type in MONITORING_STACK_TYPES):
@@ -827,8 +823,6 @@ class CephadmUpgrade:
                         'who': section,
                     })
 
-            logger.debug('Upgrade: All %s daemons are up to date.' % daemon_type)
-
             # complete osd upgrade?
             if daemon_type == 'osd':
                 osdmap = self.mgr.get("osd_map")
@@ -881,6 +875,13 @@ class CephadmUpgrade:
                     self.upgrade_state.fs_original_allow_standby_replay = {}
                     self._save_upgrade_state()
 
+            # Make sure all metadata is up to date before saying we are done upgrading this daemon type
+            if self.mgr.use_agent and not self.mgr.cache.all_host_metadata_up_to_date():
+                self.mgr.agent_helpers._request_ack_all_not_up_to_date()
+                return
+
+            logger.debug('Upgrade: All %s daemons are up to date.' % daemon_type)
+
         # clean up
         logger.info('Upgrade: Finalizing container_image settings')
         self.mgr.set_container_image('global', target_image)