From: Adam King Date: Thu, 2 Dec 2021 13:27:10 +0000 (-0500) Subject: mgr/cephadm: speed up upgrade when using agent X-Git-Tag: v17.1.0~136^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bf2c4408aeb9e2b92d495c09514b73e59e8e5187;p=ceph.git mgr/cephadm: speed up upgrade when using agent Returning completely from the upgrade procedure when any hosts at all don't have their metadata up-to-date is unnecessarily strict. We can just skip daemons specifically on hosts where the metadata is not up-to-date and then just make sure the metadata is up-to-date on all hosts before calling the upgrade complete. Fixes: https://tracker.ceph.com/issues/53598 Signed-off-by: Adam King --- diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index c1609e042be4..e537e5978b73 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -356,6 +356,12 @@ class CephadmAgentHelpers: host, self.mgr.cache.agent_ports[host], {'counter': self.mgr.cache.agent_counter[host]}, self.mgr) message_thread.start() + def _request_ack_all_not_up_to_date(self) -> None: + self.mgr.agent_helpers._request_agent_acks( + set([h for h in self.mgr.cache.get_hosts() if + (not self.mgr.cache.host_metadata_up_to_date(h) + and h in self.mgr.cache.agent_ports and not self.mgr.cache.messaging_agent(h))])) + def _agent_down(self, host: str) -> bool: # if host is draining or drained (has _no_schedule label) there should not # be an agent deployed there and therefore we should return False diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index 487d9c84d676..a075d8bfc5ab 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -521,13 +521,6 @@ class CephadmUpgrade: target_digests = self.upgrade_state.target_digests target_version = self.upgrade_state.target_version - if self.mgr.use_agent and not self.mgr.cache.all_host_metadata_up_to_date(): - # need to wait for 
metadata to come in - self.mgr.agent_helpers._request_agent_acks( - set([h for h in self.mgr.cache.get_hosts() if - (not self.mgr.cache.host_metadata_up_to_date(h) and h in self.mgr.cache.agent_ports and not self.mgr.cache.messaging_agent(h))])) - return - first = False if not target_id or not target_version or not target_digests: # need to learn the container hash @@ -612,6 +605,9 @@ class CephadmUpgrade: continue assert d.daemon_type is not None assert d.daemon_id is not None + assert d.hostname is not None + if self.mgr.use_agent and not self.mgr.cache.host_metadata_up_to_date(d.hostname): + continue correct_digest = False if (any(d in target_digests for d in (d.container_image_digests or [])) or d.daemon_type in MONITORING_STACK_TYPES): @@ -827,8 +823,6 @@ class CephadmUpgrade: 'who': section, }) - logger.debug('Upgrade: All %s daemons are up to date.' % daemon_type) - # complete osd upgrade? if daemon_type == 'osd': osdmap = self.mgr.get("osd_map") @@ -881,6 +875,13 @@ class CephadmUpgrade: self.upgrade_state.fs_original_allow_standby_replay = {} self._save_upgrade_state() + # Make sure all metadata is up to date before saying we are done upgrading this daemon type + if self.mgr.use_agent and not self.mgr.cache.all_host_metadata_up_to_date(): + self.mgr.agent_helpers._request_ack_all_not_up_to_date() + return + + logger.debug('Upgrade: All %s daemons are up to date.' % daemon_type) + # clean up logger.info('Upgrade: Finalizing container_image settings') self.mgr.set_container_image('global', target_image)