From: Adam King Date: Tue, 28 Sep 2021 12:50:34 +0000 (-0400) Subject: mgr/cephadm: fix host drain with agent X-Git-Tag: v17.1.0~739^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=20989eae5760ae7928bab1c7e55488b6cea8acee;p=ceph-ci.git mgr/cephadm: fix host drain with agent Agent was not getting removed from hosts with _no_schedule label since it was using all hosts rather than _schedulable hosts. Added a _non_draining_hosts function to return a proper list of hosts agent is okay to schedule on. Was stuck between using a daemon to report which daemons are on the host and wanting to remove all the daemons on the host when draining. Now using ssh to confirm all the daemons have left the host. Signed-off-by: Adam King --- diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index d43f6392648..cd9cd199f11 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -295,6 +295,10 @@ class CephadmAgentHelpers: message_thread.start() def _agent_down(self, host: str) -> bool: + # if host is draining or drained (has _no_schedule label) there should not + # be an agent deployed there and therefore we should return False + if host not in [h.hostname for h in self.mgr._non_draining_hosts()]: + return False # if we don't have a timestamp, it's likely because of a mgr fail over. # just set the timestamp to now. However, if host was offline before, we # should not allow creating a new timestamp to cause it to be marked online diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 5ddfa436f1a..3cc37845754 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1391,6 +1391,18 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, ) ] + def _non_draining_hosts(self) -> List[HostSpec]: + """ + Returns all hosts that do not have _no_schedule label. 
+ + Useful for the agent who needs this specific list rather than the + _schedulable_hosts since the agent needs to be deployed on hosts with + no daemon refresh + """ + return [ + h for h in self.inventory.all_specs() if '_no_schedule' not in h.labels + ] + def _unreachable_hosts(self) -> List[HostSpec]: """ Return all hosts that are offline or in maintenance mode. diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index f8d4398b56f..ef15bab72b1 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -295,7 +295,7 @@ class CephadmServe: if r is not None: bad_hosts.append(r) - if not self.mgr.use_agent: + if not self.mgr.use_agent or host not in [h.hostname for h in self.mgr._non_draining_hosts()]: if self.mgr.cache.host_needs_daemon_refresh(host): self.log.debug('refreshing %s daemons' % host) r = self._refresh_host_daemons(host) @@ -717,7 +717,7 @@ class CephadmServe: rank_map = self.mgr.spec_store[spec.service_name()].rank_map or {} ha = HostAssignment( spec=spec, - hosts=self.mgr.inventory.all_specs() if spec.service_name() == 'agent' else self.mgr._schedulable_hosts(), + hosts=self.mgr._non_draining_hosts() if spec.service_name() == 'agent' else self.mgr._schedulable_hosts(), unreachable_hosts=self.mgr._unreachable_hosts(), daemons=daemons, networks=self.mgr.cache.networks, @@ -908,7 +908,8 @@ class CephadmServe: finally: if self.mgr.use_agent: # can only send ack to agents if we know for sure port they bound to - hosts_altered = set([h for h in hosts_altered if h in self.mgr.cache.agent_ports]) + hosts_altered = set([h for h in hosts_altered if (h in self.mgr.cache.agent_ports and h in [ + h2.hostname for h2 in self.mgr._non_draining_hosts()])]) self.mgr.agent_helpers._request_agent_acks(hosts_altered, increment=True) if r is None: