git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: better handling of offline hosts with agent
author Adam King <adking@redhat.com>
Wed, 1 Sep 2021 15:03:16 +0000 (11:03 -0400)
committer Adam King <adking@redhat.com>
Fri, 24 Sep 2021 11:23:51 +0000 (07:23 -0400)
Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/agent.py
src/pybind/mgr/cephadm/inventory.py
src/pybind/mgr/cephadm/serve.py

index cdb42d4bc7293bbe6f526dccc9fc9de19c8864dd..9b16a4af7eb28b66e654556c2bf4e99d5be88c56 100644 (file)
@@ -106,6 +106,9 @@ class HostData:
         except Exception as e:
             self.mgr.log.warning(f'Received bad metadata from an agent: {e}')
         else:
+            # if we got here, we've already verified the keyring of the agent. If
+            # host agent is reporting on is marked offline, it shouldn't be any more
+            self.mgr.offline_hosts_remove(data['host'])
             self.handle_metadata(data)
 
     def check_request_fields(self, data: Dict[str, Any]) -> None:
@@ -271,9 +274,12 @@ class CephadmAgentHelpers:
 
     def _agent_down(self, host: str) -> bool:
         # if we don't have a timestamp, it's likely because of a mgr fail over.
-        # just set the timestamp to now
+        # just set the timestamp to now. However, if host was offline before, we
+        # should not allow creating a new timestamp to cause it to be marked online
         if host not in self.mgr.cache.agent_timestamp:
             self.mgr.cache.agent_timestamp[host] = datetime_now()
+            if host in self.mgr.offline_hosts:
+                return False
         # agent hasn't reported in 2.5 * it's refresh rate. Something is likely wrong with it.
         time_diff = datetime_now() - self.mgr.cache.agent_timestamp[host]
         if time_diff.total_seconds() > 2.5 * float(self.mgr.agent_refresh_rate):
index 3ea558607aaa765a0a0a853e1796a5c60efa99c2..fd41ebb2c4f1ffdc4e9d6ad6a856b08244cb3dd9 100644 (file)
@@ -947,7 +947,12 @@ class HostCache():
         return True
 
     def all_host_metadata_up_to_date(self) -> bool:
-        if [h for h in self.get_hosts() if not self.host_metadata_up_to_date(h)]:
+        unreachables = [h.hostname for h in self.mgr._unreachable_hosts()]
+        if [h for h in self.get_hosts() if (not self.host_metadata_up_to_date(h) and h not in unreachables)]:
+            # this function is primarily for telling if it's safe to try and apply a service
+            # spec. Since offline/maintenance hosts aren't considered in that process anyway
+            # we don't want to return False if the host without up-to-date metadata is in one
+            # of those two categories.
             return False
         return True
 
index e11a0e7e8e2fab74b74be88d60d59d99ca65d2e9..50c29e2f43902896abdf58a2c783b77aae076fea 100644 (file)
@@ -266,6 +266,8 @@ class CephadmServe:
                 return
 
             if self.mgr.use_agent and self.mgr.agent_helpers._agent_down(host):
+                agents_down.append(host)
+                self.mgr.cache.metadata_up_to_date[host] = False
                 if host in self.mgr.offline_hosts:
                     return
                 self.mgr.offline_hosts.add(host)
@@ -273,9 +275,9 @@ class CephadmServe:
                 # a long timeout trying to use an existing connection to an offline host
                 # REVISIT AFTER https://github.com/ceph/ceph/pull/42919
                 # self.mgr.ssh._reset_con(host)
-                agents_down.append(host)
-                # try to schedule redeploy of agent in case it is individually down
+
                 try:
+                    # try to schedule redeploy of agent in case it is individually down
                     agent = [a for a in self.mgr.cache.get_daemons_by_host(
                         host) if a.hostname == host][0]
                     self.mgr._schedule_daemon_action(agent.name(), 'redeploy')
@@ -283,8 +285,6 @@ class CephadmServe:
                     self.log.debug(
                         f'Failed to find entry for agent deployed on host {host}. Agent possibly never deployed: {e}')
                 return
-            elif self.mgr.use_agent:
-                self.mgr.offline_hosts_remove(host)
 
             if self.mgr.cache.host_needs_check(host):
                 r = self._check_host(host)