From 0bd7698c455122dc5dc6e2e06dc88fd5b7a7ca93 Mon Sep 17 00:00:00 2001
From: Guillaume Abrioux
Date: Tue, 19 Sep 2023 11:49:44 +0000
Subject: [PATCH] node-proxy: raise ceph warning(s) if needed

This makes the agent endpoint raise alert(s) when one or multiple
members of a component are critical.

Signed-off-by: Guillaume Abrioux
(cherry picked from commit b45ba22920afbd1471ad3163157f7dc612e6a1f1)
---
Reviewer note: this revision differs from the commit referenced above.
The forced 'storage' test failure is removed, 'state' is read from the
member report instead of being left unbound, and a component without a
mapped health check is logged instead of raising KeyError. Whitespace was
reconstructed from a line-collapsed copy: re-check the context lines and
the index hashes against the target tree before applying.

 src/pybind/mgr/cephadm/agent.py | 64 +++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py
index 40853f7806ef5..144780b65b4f9 100644
--- a/src/pybind/mgr/cephadm/agent.py
+++ b/src/pybind/mgr/cephadm/agent.py
@@ -174,6 +174,8 @@ class HostData(Server):
             if self.validate_node_proxy_data(data):
                 self.mgr.set_store(f'node_proxy/data/{data["host"]}', json.dumps(data['data']))
                 self.mgr.log.warning(f"{data}")
+                self.raise_alert(data)
+
                 results['result'] = data
 
         if cherrypy.request.method == 'GET':
@@ -182,6 +184,68 @@ class HostData(Server):
                 results[host] = json.loads(v)
         return results
 
+    def get_nok_members(self,
+                        component: str,
+                        data: Dict[str, Any]) -> List[Dict[str, str]]:
+        """Return the members of ``component`` whose health is not 'ok'.
+
+        :param component: key of ``data`` to inspect (e.g. 'storage').
+        :param data: per-component mapping of member name to a report
+            holding a ``status`` dict with a ``health`` entry.
+        :return: one ``{'member', 'status', 'state'}`` dict per failing
+            member; empty when every member is healthy.
+        """
+        nok_members: List[Dict[str, str]] = []
+
+        for member, report in data[component].items():
+            status = report['status']
+            health = status['health'].lower()
+            if health != 'ok':
+                nok_members.append(dict(
+                    member=member,
+                    status=health,
+                    # 'state' is assumed to sit next to 'health' in the
+                    # report; fall back so a missing key cannot break
+                    # the alerting path.
+                    state=status.get('state', 'unknown'),
+                ))
+
+        return nok_members
+
+    def raise_alert(self, data: Dict[str, Any]) -> None:
+        """Raise a ceph health warning per component with failing members.
+
+        :param data: request payload; ``data['data']`` maps component
+            names to their members' reports.
+        """
+        mapping: Dict[str, str] = {
+            'storage': 'NODE_PROXY_STORAGE',
+            'memory': 'NODE_PROXY_MEMORY',
+            'processors': 'NODE_PROXY_PROCESSORS',
+            'network': 'NODE_PROXY_NETWORK',
+        }
+
+        for component in data['data'].keys():
+            nok_members = self.get_nok_members(component,
+                                               data['data'])
+
+            if not nok_members:
+                continue
+            if component not in mapping:
+                # No health check code is defined for this component:
+                # log it instead of failing the request on a KeyError.
+                self.mgr.log.warning(
+                    f'{component} has members not ok but no health check is mapped to it')
+                continue
+
+            count = len(nok_members)
+            self.mgr.set_health_warning(
+                mapping[component],
+                summary=f'{count} {component} member{"s" if count > 1 else ""} {"are" if count > 1 else "is"} not ok',
+                count=count,
+                detail=[f"{member['member']} is {member['status']}: {member['state']}" for member in nok_members],
+            )
+
     def check_request_fields(self, data: Dict[str, Any]) -> None:
         fields = '{' + ', '.join([key for key in data.keys()]) + '}'
         if 'host' not in data:
-- 
2.39.5