]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: fix upgrade to version with agent
authorAdam King <adking@redhat.com>
Mon, 13 Sep 2021 17:20:30 +0000 (13:20 -0400)
committerAdam King <adking@redhat.com>
Fri, 24 Sep 2021 11:23:51 +0000 (07:23 -0400)
Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/agent.py
src/pybind/mgr/cephadm/inventory.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/upgrade.py

index 694dd1cff311729f84fee346a373941c3e21c5cb..b99e7e9bdc49784c94c778019e65c2bf3755425f 100644 (file)
@@ -171,7 +171,8 @@ class HostData:
                 up_to_date = True
             else:
                 # we got old counter value with message, inform agent of new timestamp
-                self.mgr.agent_helpers._request_agent_acks({host})
+                if not self.mgr.cache.messaging_agent(host):
+                    self.mgr.agent_helpers._request_agent_acks({host})
                 self.mgr.log.info(
                     f'Received old metadata from agent on host {host}. Requested up-to-date metadata.')
 
@@ -206,6 +207,7 @@ class AgentMessageThread(threading.Thread):
 
     def run(self) -> None:
         self.mgr.log.info(f'Sending message to agent on host {self.host}')
+        self.mgr.cache.sending_agent_message[self.host] = True
         try:
             assert self.mgr.cherrypy_thread
             root_cert = self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
@@ -232,6 +234,7 @@ class AgentMessageThread(threading.Thread):
             ssl_ctx.load_cert_chain(cert_fname, key_fname)
         except Exception as e:
             self.mgr.log.error(f'Failed to get certs for connecting to agent: {e}')
+            self.mgr.cache.sending_agent_message[self.host] = False
             return
         try:
             bytes_len: str = str(len(self.data.encode('utf-8')))
@@ -242,6 +245,7 @@ class AgentMessageThread(threading.Thread):
                 bytes_len = '0' + bytes_len
         except Exception as e:
             self.mgr.log.error(f'Failed to get length of json payload: {e}')
+            self.mgr.cache.sending_agent_message[self.host] = False
             return
         for retry_wait in [3, 5]:
             try:
@@ -262,8 +266,10 @@ class AgentMessageThread(threading.Thread):
             except Exception as e:
                 # if it's not a connection error, something has gone wrong. Give up.
                 self.mgr.log.error(f'Failed to contact agent on host {self.host}: {e}')
+                self.mgr.cache.sending_agent_message[self.host] = False
                 return
         self.mgr.log.error(f'Could not connect to agent on host {self.host}')
+        self.mgr.cache.sending_agent_message[self.host] = False
         return
 
 
@@ -273,7 +279,8 @@ class CephadmAgentHelpers:
 
     def _request_agent_acks(self, hosts: Set[str], increment: bool = False) -> None:
         for host in hosts:
-            self.mgr.cache.metadata_up_to_date[host] = False
+            if increment:
+                self.mgr.cache.metadata_up_to_date[host] = False
             if host not in self.mgr.cache.agent_counter:
                 self.mgr.cache.agent_counter[host] = 1
             elif increment:
index fd41ebb2c4f1ffdc4e9d6ad6a856b08244cb3dd9..a637f4ae0b9109e3a505b84d845e5380a1e79ad4 100644 (file)
@@ -460,6 +460,7 @@ class HostCache():
         self.metadata_up_to_date = {}  # type: Dict[str, bool]
         self.agent_keys = {}  # type: Dict[str, str]
         self.agent_ports = {}  # type: Dict[str, int]
+        self.sending_agent_message = {}  # type: Dict[str, bool]
 
     def load(self):
         # type: () -> None
@@ -941,6 +942,11 @@ class HostCache():
             return True
         return False
 
+    def messaging_agent(self, host: str) -> bool:
+        if host not in self.sending_agent_message or not self.sending_agent_message[host]:
+            return False
+        return True
+
     def host_metadata_up_to_date(self, host: str) -> bool:
         if host not in self.metadata_up_to_date or not self.metadata_up_to_date[host]:
             return False
index 32787a2c8349f2d61d8c09494fa8d58c3b360319..49d254e6b4f497df6817bcf9a2d24a12db9ba22c 100644 (file)
@@ -101,8 +101,6 @@ class CephadmServe:
                                 self.mgr._schedule_daemon_action(agent.name(), 'redeploy')
                         if 'agent' not in self.mgr.spec_store:
                             self.mgr.agent_helpers._apply_agent()
-                        for host in self.mgr.cache.get_hosts():
-                            self.mgr.cache.metadata_up_to_date[host] = False
                     else:
                         if 'agent' in self.mgr.spec_store:
                             self.mgr.spec_store.rm('agent')
index 7fc2596d2fc6135a5058857cbf2047e516483ad4..d7162941985d3fcdb799f52458651dc5f46cfee3 100644 (file)
@@ -472,7 +472,8 @@ class CephadmUpgrade:
         if self.mgr.use_agent and not self.mgr.cache.all_host_metadata_up_to_date():
             # need to wait for metadata to come in
             self.mgr.agent_helpers._request_agent_acks(
-                set([h for h in self.mgr.cache.get_hosts() if not self.mgr.cache.host_metadata_up_to_date(h)]))
+                set([h for h in self.mgr.cache.get_hosts() if
+                     (not self.mgr.cache.host_metadata_up_to_date(h) and h in self.mgr.cache.agent_ports and not self.mgr.cache.messaging_agent(h))]))
             return
 
         first = False
@@ -688,6 +689,7 @@ class CephadmUpgrade:
                         'redeploy',
                         image=target_image if not d_entry[1] else None
                     )
+                    self.mgr.cache.metadata_up_to_date[d.hostname] = False
                 except Exception as e:
                     self._fail_upgrade('UPGRADE_REDEPLOY_DAEMON', {
                         'severity': 'warning',