From aa2718ac00e013a45ef2a597a2d2545b348b3a15 Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Mon, 30 Oct 2023 15:51:56 +0000 Subject: [PATCH] node-proxy: implement criticals endpoint This adds the required changes in order to implement the endpoint '/criticals'. The goal of this endpoint is to provide a report of all critical statuses for either a given host or all hosts across the cluster. Signed-off-by: Guillaume Abrioux (cherry picked from commit ae791f8721027a9a508c7cd27e85f86f6fe7c492) --- src/pybind/mgr/cephadm/agent.py | 24 +++++++----------- src/pybind/mgr/cephadm/inventory.py | 39 +++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 17 deletions(-) diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index fb45d01f46bcf..39b0cb3d0ef88 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -142,23 +142,16 @@ class NodeProxy: except AttributeError: raise cherrypy.HTTPError(400, 'Malformed data received.') + # TODO(guits): refactor this + # TODO(guits): use self.node_proxy.get_critical_from_host() ? def get_nok_members(self, - component: str, data: Dict[str, Any]) -> List[Dict[str, str]]: nok_members: List[Dict[str, str]] = [] for member in data.keys(): - # Force a fake error for testing purpose - if component == 'storage': - _status = 'critical' - state = "[Fake error] device is faulty." - elif component == 'power': - _status = 'critical' - state = "[Fake error] power supply unplugged." 
- else: - _status = data[member]['status']['health'].lower() + _status = data[member]['status']['health'].lower() if _status.lower() != 'ok': - # state = data[member]['status']['state'] + state = data[member]['status']['state'] _member = dict( member=member, status=_status, @@ -179,13 +172,14 @@ class NodeProxy: } for component in data['patch']['status'].keys(): - self.mgr.remove_health_warning(mapping[component]) - nok_members = self.get_nok_members(component, data['patch']['status'][component]) + alert_name = f"HARDWARE_{component.upper()}" + self.mgr.remove_health_warning(alert_name) + nok_members = self.get_nok_members(data['patch']['status'][component]) if nok_members: count = len(nok_members) self.mgr.set_health_warning( - mapping[component], + alert_name, summary=f'{count} {component} member{"s" if count > 1 else ""} {"are" if count > 1 else "is"} not ok', count=count, detail=[f"{member['member']} is {member['status']}: {member['state']}" for member in nok_members], @@ -285,7 +279,7 @@ class NodeProxy: @cherrypy.tools.allow(methods=['GET']) @cherrypy.tools.json_out() def criticals(self, **kw: Any) -> Dict[str, Any]: - return self.mgr.node_proxy.criticals() + return self.mgr.node_proxy.criticals(**kw) @cherrypy.expose @cherrypy.tools.allow(methods=['GET']) diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index 401d675885c5d..31029f0a7ee51 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -1499,8 +1499,43 @@ class NodeProxyCache: return {host: self.data[host]['firmwares'] for host in hosts} - def criticals(self, **kw): - return {} + def get_critical_from_host(self, hostname: str) -> Dict[str, Any]: + results: Dict[str, Any] = {} + for component, data_component in self.data[hostname]['status'].items(): + if component not in results.keys(): + results[component] = {} + for member, data_member in data_component.items(): + if component == 'power': + data_member['status']['health'] = 
'critical' + data_member['status']['state'] = 'unplugged' + if component == 'memory': + data_member['status']['health'] = 'critical' + data_member['status']['state'] = 'errors detected' + if data_member['status']['health'].lower() != 'ok': + results[component][member] = data_member + return results + + def criticals(self, **kw: Any) -> Dict[str, Any]: + """ + Retrieves critical information for a specific hostname or all hosts. + + If a 'hostname' is provided in the keyword arguments, retrieves critical + information for that specific host. Otherwise, retrieves critical + information for all available hosts. + + :param kw: Keyword arguments, including 'hostname' if specified. + :type kw: dict + + :return: A dictionary containing critical information for each host. + :rtype: Dict[str, Any] + """ + hostname = kw.get('hostname') + results: Dict[str, Any] = {} + + hosts = [hostname] if hostname else self.data.keys() + for host in hosts: + results[host] = self.get_critical_from_host(host) + return results class AgentCache(): -- 2.39.5