From f15e1ec2559ee972689c91d6d3660ee0c37f3330 Mon Sep 17 00:00:00 2001 From: Adam King Date: Wed, 1 Sep 2021 11:03:16 -0400 Subject: [PATCH] mgr/cephadm: better handling of offline hosts with agent Signed-off-by: Adam King --- src/pybind/mgr/cephadm/agent.py | 8 +++++++- src/pybind/mgr/cephadm/inventory.py | 7 ++++++- src/pybind/mgr/cephadm/serve.py | 8 ++++---- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index cdb42d4bc7293..9b16a4af7eb28 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -106,6 +106,9 @@ class HostData: except Exception as e: self.mgr.log.warning(f'Received bad metadata from an agent: {e}') else: + # if we got here, we've already verified the keyring of the agent. If + # host agent is reporting on is marked offline, it shouldn't be any more + self.mgr.offline_hosts_remove(data['host']) self.handle_metadata(data) def check_request_fields(self, data: Dict[str, Any]) -> None: @@ -271,9 +274,12 @@ class CephadmAgentHelpers: def _agent_down(self, host: str) -> bool: # if we don't have a timestamp, it's likely because of a mgr fail over. - # just set the timestamp to now + # just set the timestamp to now. However, if host was offline before, we + # should not allow creating a new timestamp to cause it to be marked online if host not in self.mgr.cache.agent_timestamp: self.mgr.cache.agent_timestamp[host] = datetime_now() + if host in self.mgr.offline_hosts: + return False # agent hasn't reported in 2.5 * it's refresh rate. Something is likely wrong with it. time_diff = datetime_now() - self.mgr.cache.agent_timestamp[host] if time_diff.total_seconds() > 2.5 * float(self.mgr.agent_refresh_rate): diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index 3ea558607aaa7..fd41ebb2c4f1f 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -947,7 +947,12 @@ class HostCache(): return True def all_host_metadata_up_to_date(self) -> bool: - if [h for h in self.get_hosts() if not self.host_metadata_up_to_date(h)]: + unreachables = [h.hostname for h in self.mgr._unreachable_hosts()] + if [h for h in self.get_hosts() if (not self.host_metadata_up_to_date(h) and h not in unreachables)]: + # this function is primarily for telling if it's safe to try and apply a service + # spec. Since offline/maintenance hosts aren't considered in that process anyway + # we don't want to return False if the host without up-to-date metadata is in one + # of those two categories. return False return True diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index e11a0e7e8e2fa..50c29e2f43902 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -266,6 +266,8 @@ class CephadmServe: return if self.mgr.use_agent and self.mgr.agent_helpers._agent_down(host): + agents_down.append(host) + self.mgr.cache.metadata_up_to_date[host] = False if host in self.mgr.offline_hosts: return self.mgr.offline_hosts.add(host) @@ -273,9 +275,9 @@ class CephadmServe: # a long timeout trying to use an existing connection to an offline host # REVISIT AFTER https://github.com/ceph/ceph/pull/42919 # self.mgr.ssh._reset_con(host) - agents_down.append(host) - # try to schedule redeploy of agent in case it is individually down + try: + # try to schedule redeploy of agent in case it is individually down agent = [a for a in self.mgr.cache.get_daemons_by_host( host) if a.hostname == host][0] self.mgr._schedule_daemon_action(agent.name(), 'redeploy') @@ -283,8 +285,6 @@ class CephadmServe: self.log.debug( f'Failed to find entry for agent deployed on host {host}. Agent possibly never deployed: {e}') return - elif self.mgr.use_agent: - self.mgr.offline_hosts_remove(host) if self.mgr.cache.host_needs_check(host): r = self._check_host(host) -- 2.39.5