From: Adam King Date: Mon, 11 Oct 2021 19:07:49 +0000 (-0400) Subject: mgr/cephadm: update CEPHADM_AGENT_DOWN when agent reports in X-Git-Tag: v17.1.0~641^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F43489%2Fhead;p=ceph.git mgr/cephadm: update CEPHADM_AGENT_DOWN when agent reports in Right now, if an agent considered down reports in, the health warning will remain until the serve loop runs again. This change will give better responsiveness on setting/removing this health warning. Signed-off-by: Adam King --- diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index ead4c0a3bee..bbed1918f74 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -21,7 +21,7 @@ from cryptography.hazmat.primitives.asymmetric import rsa from cryptography.hazmat.primitives import hashes, serialization from cryptography.hazmat.backends import default_backend -from typing import Any, Dict, Set, Tuple, TYPE_CHECKING +from typing import Any, Dict, List, Set, Tuple, TYPE_CHECKING if TYPE_CHECKING: from cephadm.module import CephadmOrchestrator @@ -164,6 +164,12 @@ class HostData: # update timestamp of most recent agent update self.mgr.cache.agent_timestamp[host] = datetime_now() + agents_down = [] + for h in self.mgr.cache.get_hosts(): + if self.mgr.agent_helpers._agent_down(h): + agents_down.append(h) + self.mgr.agent_helpers._update_agent_down_healthcheck(agents_down) + up_to_date = False int_ack = int(data['ack']) @@ -261,6 +267,7 @@ class AgentMessageThread(threading.Thread): secure_agent_socket.sendall(msg.encode('utf-8')) agent_response = secure_agent_socket.recv(1024).decode() self.mgr.log.info(f'Received "{agent_response}" from agent on host {self.host}') + self.mgr.cache.sending_agent_message[self.host] = False return except ConnectionError as e: # if it's a connection error, possibly try to connect again. 
@@ -312,6 +319,24 @@ class CephadmAgentHelpers: return True return False + def _update_agent_down_healthcheck(self, down_agent_hosts: List[str]) -> None: + if 'CEPHADM_AGENT_DOWN' in self.mgr.health_checks: + del self.mgr.health_checks['CEPHADM_AGENT_DOWN'] + if down_agent_hosts: + detail: List[str] = [] + for agent in down_agent_hosts: + detail.append((f'Cephadm agent on host {agent} has not reported in ' + f'{2.5 * self.mgr.agent_refresh_rate} seconds. Agent is assumed ' + 'down and host may be offline.')) + self.mgr.health_checks['CEPHADM_AGENT_DOWN'] = { + 'severity': 'warning', + 'summary': '%d Cephadm Agent(s) are not reporting. ' + 'Hosts may be offline' % (len(down_agent_hosts)), + 'count': len(down_agent_hosts), + 'detail': detail, + } + self.mgr.set_health_checks(self.mgr.health_checks) + # this function probably seems very unnecessary, but it makes it considerably easier # to get the unit tests working. All unit tests that check which daemons were deployed # or services setup would have to be individually changed to expect an agent service or diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index e3e9c570883..a51aac9abc6 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -366,22 +366,7 @@ class CephadmServe: refresh(self.mgr.cache.get_hosts()) - if 'CEPHADM_AGENT_DOWN' in self.mgr.health_checks: - del self.mgr.health_checks['CEPHADM_AGENT_DOWN'] - if agents_down: - detail: List[str] = [] - for agent in agents_down: - detail.append((f'Cephadm agent on host {agent} has not reported in ' - f'{2.5 * self.mgr.agent_refresh_rate} seconds. Agent is assumed ' - 'down and host may be offline.')) - self.mgr.health_checks['CEPHADM_AGENT_DOWN'] = { - 'severity': 'warning', - 'summary': '%d Cephadm Agent(s) are not reporting. 
' - 'Hosts may be offline' % (len(agents_down)), - 'count': len(agents_down), - 'detail': detail, - } - self.mgr.set_health_checks(self.mgr.health_checks) + self.mgr.agent_helpers._update_agent_down_healthcheck(agents_down) self.mgr.config_checker.run_checks()