mgr/cephadm: fix host drain with agent

author Adam King <adking@redhat.com>

Tue, 28 Sep 2021 12:50:34 +0000 (08:50 -0400)

committer Adam King <adking@redhat.com>

Tue, 28 Sep 2021 15:43:10 +0000 (11:43 -0400)
author Adam King <adking@redhat.com>
Tue, 28 Sep 2021 12:50:34 +0000 (08:50 -0400)
committer Adam King <adking@redhat.com>
Tue, 28 Sep 2021 15:43:10 +0000 (11:43 -0400)
diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py

index d43f639264855b225f4a5464a8fe969ef6952b86..cd9cd199f114c0b26c16a4ef984e2fce1730f9b6 100644 (file)
--- a/src/pybind/mgr/cephadm/agent.py
+++ b/src/pybind/mgr/cephadm/agent.py
@@ -295,6 +295,10 @@ class CephadmAgentHelpers:
              message_thread.start()
  
      def _agent_down(self, host: str) -> bool:
+        # if host is draining or drained (has _no_schedule label) there should not
+        # be an agent deployed there and therefore we should return False
+        if host not in [h.hostname for h in self.mgr._non_draining_hosts()]:
+            return False
          # if we don't have a timestamp, it's likely because of a mgr fail over.
          # just set the timestamp to now. However, if host was offline before, we
          # should not allow creating a new timestamp to cause it to be marked online
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py

index 5ddfa436f1a864fac4d684dffc5a0fa631ddf214..3cc37845754a946e6ef35a5fe8aa071bcf2593cd 100644 (file)
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1391,6 +1391,18 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
              )
          ]
  
+    def _non_draining_hosts(self) -> List[HostSpec]:
+        """
+        Return the spec of every inventory host lacking the _no_schedule label.
+
+        The agent needs this list rather than _schedulable_hosts() because it
+        needs to be deployed on hosts with no daemon refresh; _no_schedule
+        marks a host that is draining or drained.
+        """
+        return [
+            h for h in self.inventory.all_specs() if '_no_schedule' not in h.labels
+        ]
+
      def _unreachable_hosts(self) -> List[HostSpec]:
          """
          Return all hosts that are offline or in maintenance mode.
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py

index f8d4398b56f6b379082d8f517d18ed6fb6af33dd..ef15bab72b18084d91532b0b1a9d135a71d7d891 100644 (file)
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -295,7 +295,7 @@ class CephadmServe:
                  if r is not None:
                      bad_hosts.append(r)
  
-            if not self.mgr.use_agent:
+            if not self.mgr.use_agent or host not in [h.hostname for h in self.mgr._non_draining_hosts()]:
                  if self.mgr.cache.host_needs_daemon_refresh(host):
                      self.log.debug('refreshing %s daemons' % host)
                      r = self._refresh_host_daemons(host)
@@ -717,7 +717,7 @@ class CephadmServe:
              rank_map = self.mgr.spec_store[spec.service_name()].rank_map or {}
          ha = HostAssignment(
              spec=spec,
-            hosts=self.mgr.inventory.all_specs() if spec.service_name() == 'agent' else self.mgr._schedulable_hosts(),
+            hosts=self.mgr._non_draining_hosts() if spec.service_name() == 'agent' else self.mgr._schedulable_hosts(),
              unreachable_hosts=self.mgr._unreachable_hosts(),
              daemons=daemons,
              networks=self.mgr.cache.networks,
@@ -908,7 +908,8 @@ class CephadmServe:
          finally:
              if self.mgr.use_agent:
                  # can only send ack to agents if we know for sure port they bound to
-                hosts_altered = set([h for h in hosts_altered if h in self.mgr.cache.agent_ports])
+                hosts_altered = set([h for h in hosts_altered if (h in self.mgr.cache.agent_ports and h in [
+                                    h2.hostname for h2 in self.mgr._non_draining_hosts()])])
                  self.mgr.agent_helpers._request_agent_acks(hosts_altered, increment=True)
  
          if r is None:
author	Adam King <adking@redhat.com>
	Tue, 28 Sep 2021 12:50:34 +0000 (08:50 -0400)
committer	Adam King <adking@redhat.com>
	Tue, 28 Sep 2021 15:43:10 +0000 (11:43 -0400)
src/pybind/mgr/cephadm/agent.py		patch \| blob \| history
src/pybind/mgr/cephadm/module.py		patch \| blob \| history
src/pybind/mgr/cephadm/serve.py		patch \| blob \| history