From: Guillaume Abrioux Date: Thu, 29 Jan 2026 12:14:40 +0000 (+0100) Subject: mgr/cephadm: safe status/health access in node-proxy agent and inventory X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=576817410ffa41afda6b7d0b971b61d8f6a432ce;p=ceph.git mgr/cephadm: safe status/health access in node-proxy agent and inventory This adds helpers in NodeProxyEndpoint and NodeProxyCache to safely read status.health and status.state. In NodeProxyEndpoint, methods _get_health_value() and _get_state_value() are used in get_nok_members() to avoid KeyError on malformed data. In NodeProxyCache, _get_health_value(), _has_health_value(), _is_error_status(), and _is_unknown_status() are used in fullreport() and when filtering 'non ok' members instead of accessing status['status']['health'] inline. Fixes: https://tracker.ceph.com/issues/74749 Signed-off-by: Guillaume Abrioux --- diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index cec4ab0ab81..8722bac4649 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -25,6 +25,7 @@ import tempfile from cephadm.services.service_registry import service_registry from cephadm.services.cephadmservice import CephadmAgent from cephadm.tlsobject_types import TLSCredentials +from cephadm.utils import get_node_proxy_status_value from urllib.error import HTTPError, URLError from typing import Any, Dict, List, Set, TYPE_CHECKING, Optional, MutableMapping, IO @@ -186,6 +187,12 @@ class NodeProxyEndpoint: except AttributeError: raise cherrypy.HTTPError(400, 'Malformed data received.') + def _get_health_value(self, member_data: Any) -> str: + return get_node_proxy_status_value(member_data, 'health', lower=True) + + def _get_state_value(self, member_data: Any) -> str: + return get_node_proxy_status_value(member_data, 'state') + # TODO(guits): refactor this # TODO(guits): use self.node_proxy.get_critical_from_host() ? def get_nok_members(self, @@ -206,9 +213,10 @@ class NodeProxyEndpoint: for sys_id in data.keys(): for member in data[sys_id].keys(): - _status = data[sys_id][member]['status']['health'].lower() - if _status.lower() != 'ok': - state = data[sys_id][member]['status']['state'] + member_data = data[sys_id][member] + _status = self._get_health_value(member_data) + if _status and _status != 'ok': + state = self._get_state_value(member_data) _member = dict( sys_id=sys_id, member=member, diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index c4a99fbe88e..ab37954372a 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -26,7 +26,7 @@ from orchestrator import OrchestratorError, HostSpec, OrchestratorEvent, service from cephadm.services.cephadmservice import CephadmDaemonDeploySpec from mgr_util import parse_combined_pem_file -from .utils import resolve_ip, SpecialHostLabels +from .utils import get_node_proxy_status_value, resolve_ip, SpecialHostLabels from .migrations import queue_migrate_nfs_spec, queue_migrate_rgw_spec if TYPE_CHECKING: @@ -1657,6 +1657,18 @@ class NodeProxyCache: self.keyrings[host] = key self.mgr.set_store(f'{NODE_PROXY_CACHE_PREFIX}/keyrings', json.dumps(self.keyrings)) + def _get_health_value(self, status: Any) -> str: + return get_node_proxy_status_value(status, 'health', lower=True) + + def _has_health_value(self, statuses: ValuesView, health_value: str) -> bool: + return any([self._get_health_value(status) == health_value for status in statuses]) + + def _is_error_status(self, statuses: ValuesView) -> bool: + return self._has_health_value(statuses, 'error') + + def _is_unknown_status(self, statuses: ValuesView) -> bool: + return self._has_health_value(statuses, 'unknown') and not self._is_error_status(statuses) + def fullreport(self, **kw: Any) -> Dict[str, Any]: """ Retrieves the full report for the specified hostname. @@ -1694,12 +1706,6 @@ class NodeProxyCache: hostname = kw.get('hostname') hosts = [hostname] if hostname else self.data.keys() - def is_unknown(statuses: ValuesView) -> bool: - return any([status['status']['health'].lower() == 'unknown' for status in statuses]) and not is_error(statuses) - - def is_error(statuses: ValuesView) -> bool: - return any([status['status']['health'].lower() == 'error' for status in statuses]) - _result: Dict[str, Any] = {} for host in hosts: @@ -1711,9 +1717,9 @@ class NodeProxyCache: _sys_id_res: List[str] = [] for element in details.values(): values = element.values() - if is_error(values): + if self._is_error_status(values): state = 'error' - elif is_unknown(values) or not values: + elif self._is_unknown_status(values) or not values: state = 'unknown' else: state = 'ok' @@ -1788,7 +1794,7 @@ class NodeProxyCache: if component_name == 'memory': data_member['status']['health'] = 'critical' data_member['status']['state'] = 'errors detected' - if data_member['status']['health'].lower() != 'ok': + if self._get_health_value(data_member) != 'ok': results[component_name][member] = data_member return results diff --git a/src/pybind/mgr/cephadm/utils.py b/src/pybind/mgr/cephadm/utils.py index 9ef394e86dc..805ca953e91 100644 --- a/src/pybind/mgr/cephadm/utils.py +++ b/src/pybind/mgr/cephadm/utils.py @@ -162,3 +162,15 @@ def md5_hash(input_value: str) -> str: input_str = str(input_value).encode('utf-8') hash_object = hashlib.md5(input_str) return hash_object.hexdigest() + + +def get_node_proxy_status_value(data: Any, key: str, lower: bool = False) -> str: + if not isinstance(data, dict): + return '' + status = data.get('status', {}) + if not isinstance(status, dict): + return '' + value = status.get(key, '') + if not isinstance(value, str): + return '' + return value.lower() if lower else value