From 20989eae5760ae7928bab1c7e55488b6cea8acee Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Tue, 28 Sep 2021 08:50:34 -0400
Subject: [PATCH] mgr/cephadm: fix host drain with agent

The agent was not getting removed from hosts with the _no_schedule
label since it was scheduled on all hosts rather than on
_schedulable_hosts. Added a _non_draining_hosts function to return a
proper list of hosts the agent is okay to schedule on.

We were stuck between using a daemon to report which daemons are on the
host and wanting to remove all the daemons on the host when draining.
We now use ssh to confirm all the daemons have left the host.

Signed-off-by: Adam King <adking@redhat.com>
---
 src/pybind/mgr/cephadm/agent.py  |  4 ++++
 src/pybind/mgr/cephadm/module.py | 12 ++++++++++++
 src/pybind/mgr/cephadm/serve.py  |  7 ++++---
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py
index d43f639264855..cd9cd199f114c 100644
--- a/src/pybind/mgr/cephadm/agent.py
+++ b/src/pybind/mgr/cephadm/agent.py
@@ -295,6 +295,10 @@ class CephadmAgentHelpers:
             message_thread.start()
 
     def _agent_down(self, host: str) -> bool:
+        # if host is draining or drained (has _no_schedule label) there should not
+        # be an agent deployed there and therefore we should return False
+        if host not in [h.hostname for h in self.mgr._non_draining_hosts()]:
+            return False
         # if we don't have a timestamp, it's likely because of a mgr fail over.
         # just set the timestamp to now. However, if host was offline before, we
         # should not allow creating a new timestamp to cause it to be marked online
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 5ddfa436f1a86..3cc37845754a9 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1391,6 +1391,18 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             )
         ]
 
+    def _non_draining_hosts(self) -> List[HostSpec]:
+        """
+        Return every inventory host that does not have the _no_schedule label.
+
+        Useful for the agent, which needs this specific list rather than
+        _schedulable_hosts since the agent needs to be deployed on hosts that
+        have had no daemon refresh, but not on draining or drained hosts.
+        """
+        return [
+            h for h in self.inventory.all_specs() if '_no_schedule' not in h.labels
+        ]
+
     def _unreachable_hosts(self) -> List[HostSpec]:
         """
         Return all hosts that are offline or in maintenance mode.
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index f8d4398b56f6b..ef15bab72b180 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -295,7 +295,7 @@ class CephadmServe:
                 if r is not None:
                     bad_hosts.append(r)
 
-            if not self.mgr.use_agent:
+            if not self.mgr.use_agent or host not in [h.hostname for h in self.mgr._non_draining_hosts()]:
                 if self.mgr.cache.host_needs_daemon_refresh(host):
                     self.log.debug('refreshing %s daemons' % host)
                     r = self._refresh_host_daemons(host)
@@ -717,7 +717,7 @@ class CephadmServe:
             rank_map = self.mgr.spec_store[spec.service_name()].rank_map or {}
         ha = HostAssignment(
             spec=spec,
-            hosts=self.mgr.inventory.all_specs() if spec.service_name() == 'agent' else self.mgr._schedulable_hosts(),
+            hosts=self.mgr._non_draining_hosts() if spec.service_name() == 'agent' else self.mgr._schedulable_hosts(),
             unreachable_hosts=self.mgr._unreachable_hosts(),
             daemons=daemons,
             networks=self.mgr.cache.networks,
@@ -908,7 +908,8 @@ class CephadmServe:
         finally:
             if self.mgr.use_agent:
                 # can only send ack to agents if we know for sure port they bound to
-                hosts_altered = set([h for h in hosts_altered if h in self.mgr.cache.agent_ports])
+                hosts_altered = set([h for h in hosts_altered if (h in self.mgr.cache.agent_ports and h in [
+                                    h2.hostname for h2 in self.mgr._non_draining_hosts()])])
                 self.mgr.agent_helpers._request_agent_acks(hosts_altered, increment=True)
 
         if r is None:
-- 
2.39.5