From f15e1ec2559ee972689c91d6d3660ee0c37f3330 Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Wed, 1 Sep 2021 11:03:16 -0400
Subject: [PATCH] mgr/cephadm: better handling of offline hosts with agent

Signed-off-by: Adam King <adking@redhat.com>
---
 src/pybind/mgr/cephadm/agent.py     | 8 +++++++-
 src/pybind/mgr/cephadm/inventory.py | 7 ++++++-
 src/pybind/mgr/cephadm/serve.py     | 8 ++++----
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py
index cdb42d4bc7293..9b16a4af7eb28 100644
--- a/src/pybind/mgr/cephadm/agent.py
+++ b/src/pybind/mgr/cephadm/agent.py
@@ -106,6 +106,9 @@ class HostData:
         except Exception as e:
             self.mgr.log.warning(f'Received bad metadata from an agent: {e}')
         else:
+            # if we got here, we've already verified the keyring of the agent. If the
+            # host the agent is reporting on is marked offline, it shouldn't be any more
+            self.mgr.offline_hosts_remove(data['host'])
             self.handle_metadata(data)
 
     def check_request_fields(self, data: Dict[str, Any]) -> None:
@@ -271,9 +274,12 @@ class CephadmAgentHelpers:
 
     def _agent_down(self, host: str) -> bool:
         # if we don't have a timestamp, it's likely because of a mgr fail over.
-        # just set the timestamp to now
+        # just set the timestamp to now. However, if the host was offline before, we
+        # should not let the newly created timestamp cause it to be marked online
         if host not in self.mgr.cache.agent_timestamp:
             self.mgr.cache.agent_timestamp[host] = datetime_now()
+            if host in self.mgr.offline_hosts:
+                return False
         # agent hasn't reported in 2.5 * it's refresh rate. Something is likely wrong with it.
         time_diff = datetime_now() - self.mgr.cache.agent_timestamp[host]
         if time_diff.total_seconds() > 2.5 * float(self.mgr.agent_refresh_rate):
diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py
index 3ea558607aaa7..fd41ebb2c4f1f 100644
--- a/src/pybind/mgr/cephadm/inventory.py
+++ b/src/pybind/mgr/cephadm/inventory.py
@@ -947,7 +947,12 @@ class HostCache():
         return True
 
     def all_host_metadata_up_to_date(self) -> bool:
-        if [h for h in self.get_hosts() if not self.host_metadata_up_to_date(h)]:
+        unreachables = [h.hostname for h in self.mgr._unreachable_hosts()]
+        if [h for h in self.get_hosts() if (not self.host_metadata_up_to_date(h) and h not in unreachables)]:
+            # this function is primarily for telling if it's safe to try and apply a service
+            # spec. Since offline/maintenance hosts aren't considered in that process anyway,
+            # we don't want to return False if the host without up-to-date metadata is in one
+            # of those two categories.
             return False
         return True
 
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index e11a0e7e8e2fa..50c29e2f43902 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -266,6 +266,8 @@ class CephadmServe:
                 return
 
             if self.mgr.use_agent and self.mgr.agent_helpers._agent_down(host):
+                agents_down.append(host)
+                self.mgr.cache.metadata_up_to_date[host] = False
                 if host in self.mgr.offline_hosts:
                     return
                 self.mgr.offline_hosts.add(host)
@@ -273,9 +275,9 @@ class CephadmServe:
                 # a long timeout trying to use an existing connection to an offline host
                 # REVISIT AFTER https://github.com/ceph/ceph/pull/42919
                 # self.mgr.ssh._reset_con(host)
-                agents_down.append(host)
-                # try to schedule redeploy of agent in case it is individually down
+
                 try:
+                    # try to schedule redeploy of agent in case it is individually down
                     agent = [a for a in self.mgr.cache.get_daemons_by_host(
                         host) if a.hostname == host][0]
                     self.mgr._schedule_daemon_action(agent.name(), 'redeploy')
@@ -283,8 +285,6 @@ class CephadmServe:
                     self.log.debug(
                         f'Failed to find entry for agent deployed on host {host}. Agent possibly never deployed: {e}')
                 return
-            elif self.mgr.use_agent:
-                self.mgr.offline_hosts_remove(host)
 
             if self.mgr.cache.host_needs_check(host):
                 r = self._check_host(host)
-- 
2.39.5