git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
node-proxy: raise ceph warning(s) if needed
author Guillaume Abrioux <gabrioux@ibm.com>
Tue, 19 Sep 2023 11:49:44 +0000 (11:49 +0000)
committer Guillaume Abrioux <gabrioux@ibm.com>
Thu, 25 Jan 2024 14:54:44 +0000 (14:54 +0000)
This makes the agent endpoint raise alert(s) when one or multiple
members of a component are critical.

Signed-off-by: Guillaume Abrioux <gabrioux@ibm.com>
(cherry picked from commit b45ba22920afbd1471ad3163157f7dc612e6a1f1)

src/pybind/mgr/cephadm/agent.py

index 40853f7806ef55f081946effb784bf8f587f7ef8..144780b65b4f9d96a95d23b50549395eb43920ab 100644 (file)
@@ -174,6 +174,8 @@ class HostData(Server):
             if self.validate_node_proxy_data(data):
                 self.mgr.set_store(f'node_proxy/data/{data["host"]}', json.dumps(data['data']))
                 self.mgr.log.warning(f"{data}")
+                self.raise_alert(data)
+
                 results['result'] = data
 
         if cherrypy.request.method == 'GET':
@@ -182,6 +184,50 @@ class HostData(Server):
                 results[host] = json.loads(v)
         return results
 
+    def get_nok_members(self,
+                        component: str,
+                        data: Dict[str, Any]) -> List[Dict[str, str]]:
+        """Return the members of ``component`` whose health is not ``ok``.
+
+        :param component: key into ``data`` naming the component to inspect.
+        :param data: mapping of component name to a mapping of member name
+            to member details; each member must provide ``status.health``.
+        :return: one dict per unhealthy member with the keys ``member``,
+            ``status`` (the lower-cased health value) and ``state``.
+        :raises KeyError: if ``component``, or a member's ``status`` or
+            ``health`` entry, is missing.
+        """
+        nok_members: List[Dict[str, str]] = []
+
+        for member, details in data[component].items():
+            status = details['status']
+            health = status['health'].lower()
+            if health == 'ok':
+                continue
+            nok_members.append(dict(
+                member=member,
+                status=health,
+                # 'state' is looked up leniently so an unhealthy member that
+                # lacks it is still reported instead of raising here.
+                state=status.get('state', 'unknown'),
+            ))
+
+        return nok_members
+
+    def raise_alert(self, data: Dict[str, Any]) -> None:
+        """Raise a health warning for each component with unhealthy members.
+
+        For every component under ``data['data']`` the unhealthy members are
+        collected with ``get_nok_members``; when there is at least one,
+        ``self.mgr.set_health_warning`` is called with a per-component code.
+
+        :param data: payload whose ``'data'`` entry maps component names to
+            their members.
+        :raises KeyError: if ``data`` has no ``'data'`` entry.
+        """
+        # NOTE(review): nothing in this method clears a warning once the
+        # members recover -- confirm whether that is handled elsewhere.
+        mapping: Dict[str, str] = {
+            'storage': 'NODE_PROXY_STORAGE',
+            'memory': 'NODE_PROXY_MEMORY',
+            'processors': 'NODE_PROXY_PROCESSORS',
+            'network': 'NODE_PROXY_NETWORK',
+        }
+
+        components = data['data']
+        for component in components:
+            nok_members = self.get_nok_members(component, components)
+            if not nok_members:
+                continue
+
+            count = len(nok_members)
+            # A component without a dedicated entry gets a code built on the
+            # same pattern instead of failing with KeyError.
+            alert_id = mapping.get(component, f'NODE_PROXY_{component.upper()}')
+            self.mgr.set_health_warning(
+                alert_id,
+                summary=f'{count} {component} member{"s" if count > 1 else ""} {"are" if count > 1 else "is"} not ok',
+                count=count,
+                detail=[f"{member['member']} is {member['status']}: {member['state']}" for member in nok_members],
+            )
+
     def check_request_fields(self, data: Dict[str, Any]) -> None:
         fields = '{' + ', '.join([key for key in data.keys()]) + '}'
         if 'host' not in data: