]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/cephadm: safe status/health access in node-proxy agent and inventory
authorGuillaume Abrioux <gabrioux@ibm.com>
Thu, 29 Jan 2026 12:14:40 +0000 (13:14 +0100)
committerGuillaume Abrioux <gabrioux@ibm.com>
Thu, 19 Feb 2026 12:51:31 +0000 (12:51 +0000)
This adds helpers in NodeProxyEndpoint and NodeProxyCache to safely
read status.health and status.state.

In NodeProxyEndpoint, methods _get_health_value() and _get_state_value()
are used in get_nok_members() to avoid KeyError on malformed data.

In NodeProxyCache, _get_health_value(), _has_health_value(),
_is_error_status(), and _is_unknown_status() are used in fullreport()
and when filtering 'non ok' members instead of accessing
status['status']['health'] inline.

Fixes: https://tracker.ceph.com/issues/74749
Signed-off-by: Guillaume Abrioux <gabrioux@ibm.com>
(cherry picked from commit 576817410ffa41afda6b7d0b971b61d8f6a432ce)

src/pybind/mgr/cephadm/agent.py
src/pybind/mgr/cephadm/inventory.py
src/pybind/mgr/cephadm/utils.py

index 11b33427ec7475fe14160866f9624245fcec3c64..a27e190c920d6500503563b297f95c69a76bade6 100644 (file)
@@ -23,6 +23,7 @@ from mgr_util import test_port_allocation, PortAlreadyInUse
 from mgr_util import verify_tls_files
 import tempfile
 from cephadm.services.service_registry import service_registry
+from cephadm.utils import get_node_proxy_status_value
 
 from urllib.error import HTTPError, URLError
 from typing import Any, Dict, List, Set, TYPE_CHECKING, Optional, MutableMapping, IO
@@ -174,6 +175,12 @@ class NodeProxyEndpoint:
         except AttributeError:
             raise cherrypy.HTTPError(400, 'Malformed data received.')
 
+    def _get_health_value(self, member_data: Any) -> str:
+        return get_node_proxy_status_value(member_data, 'health', lower=True)
+
+    def _get_state_value(self, member_data: Any) -> str:
+        return get_node_proxy_status_value(member_data, 'state')
+
     # TODO(guits): refactor this
     # TODO(guits): use self.node_proxy.get_critical_from_host() ?
     def get_nok_members(self,
@@ -194,9 +201,10 @@ class NodeProxyEndpoint:
 
         for sys_id in data.keys():
             for member in data[sys_id].keys():
-                _status = data[sys_id][member]['status']['health'].lower()
-                if _status.lower() != 'ok':
-                    state = data[sys_id][member]['status']['state']
+                member_data = data[sys_id][member]
+                _status = self._get_health_value(member_data)
+                if _status and _status != 'ok':
+                    state = self._get_state_value(member_data)
                     _member = dict(
                         sys_id=sys_id,
                         member=member,
index 713b9ff10778dd57e17aa7b50cddfe88bbd34f10..b32bd1896cf0869d5a40cefdf0e9809c71ad81ae 100644 (file)
@@ -25,7 +25,7 @@ from ceph.utils import str_to_datetime, datetime_to_str, datetime_now
 from orchestrator import OrchestratorError, HostSpec, OrchestratorEvent, service_to_daemon_types
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
 
-from .utils import resolve_ip, SpecialHostLabels
+from .utils import get_node_proxy_status_value, resolve_ip, SpecialHostLabels
 from .migrations import queue_migrate_nfs_spec, queue_migrate_rgw_spec
 
 if TYPE_CHECKING:
@@ -1641,6 +1641,18 @@ class NodeProxyCache:
         self.keyrings[host] = key
         self.mgr.set_store(f'{NODE_PROXY_CACHE_PREFIX}/keyrings', json.dumps(self.keyrings))
 
+    def _get_health_value(self, status: Any) -> str:
+        return get_node_proxy_status_value(status, 'health', lower=True)
+
+    def _has_health_value(self, statuses: ValuesView, health_value: str) -> bool:
+        return any([self._get_health_value(status) == health_value for status in statuses])
+
+    def _is_error_status(self, statuses: ValuesView) -> bool:
+        return self._has_health_value(statuses, 'error')
+
+    def _is_unknown_status(self, statuses: ValuesView) -> bool:
+        return self._has_health_value(statuses, 'unknown') and not self._is_error_status(statuses)
+
     def fullreport(self, **kw: Any) -> Dict[str, Any]:
         """
         Retrieves the full report for the specified hostname.
@@ -1678,12 +1690,6 @@ class NodeProxyCache:
         hostname = kw.get('hostname')
         hosts = [hostname] if hostname else self.data.keys()
 
-        def is_unknown(statuses: ValuesView) -> bool:
-            return any([status['status']['health'].lower() == 'unknown' for status in statuses]) and not is_error(statuses)
-
-        def is_error(statuses: ValuesView) -> bool:
-            return any([status['status']['health'].lower() == 'error' for status in statuses])
-
         _result: Dict[str, Any] = {}
 
         for host in hosts:
@@ -1695,9 +1701,9 @@ class NodeProxyCache:
                 _sys_id_res: List[str] = []
                 for element in details.values():
                     values = element.values()
-                    if is_error(values):
+                    if self._is_error_status(values):
                         state = 'error'
-                    elif is_unknown(values) or not values:
+                    elif self._is_unknown_status(values) or not values:
                         state = 'unknown'
                     else:
                         state = 'ok'
@@ -1772,7 +1778,7 @@ class NodeProxyCache:
                     if component_name == 'memory':
                         data_member['status']['health'] = 'critical'
                         data_member['status']['state'] = 'errors detected'
-                    if data_member['status']['health'].lower() != 'ok':
+                    if self._get_health_value(data_member) != 'ok':
                         results[component_name][member] = data_member
         return results
 
index 5d09518da4c4d9812968590ecd0ee76584d4f145..c91e5a9bc9ea392e0582a643fca318c886b3bffb 100644 (file)
@@ -161,3 +161,15 @@ def md5_hash(input_value: str) -> str:
     input_str = str(input_value).encode('utf-8')
     hash_object = hashlib.md5(input_str)
     return hash_object.hexdigest()
+
+
+def get_node_proxy_status_value(data: Any, key: str, lower: bool = False) -> str:
+    if not isinstance(data, dict):
+        return ''
+    status = data.get('status', {})
+    if not isinstance(status, dict):
+        return ''
+    value = status.get(key, '')
+    if not isinstance(value, str):
+        return ''
+    return value.lower() if lower else value