From 9ed7f931c68f4d3ca460755ef844d810dc5c62ac Mon Sep 17 00:00:00 2001
From: Ankush Behl
Date: Fri, 8 Aug 2025 18:20:45 +0530
Subject: [PATCH] prometheus: Add Cephadm orch ps output metric to prometheus

Fixes: https://tracker.ceph.com/issues/72496
Signed-off-by: Ankush Behl
---
Review notes (not part of the commit message):
 - the blob ids on the "index" line predate the review edits below
 - context-line whitespace was reconstructed; verify against the tree

 src/pybind/mgr/prometheus/module.py | 56 +++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py
index 41b62fd6a93..a932d3d149b 100644
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -115,6 +115,8 @@ NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
 SMB_METADATA = ('smb_version', 'volume',
                 'subvolume_group', 'subvolume', 'netbiosname', 'share')
 
+CEPHADM_DAEMON_STATUS = ('service_type', 'daemon_name', 'hostname', 'service_name')
+
 alert_metric = namedtuple('alert_metric', 'name description')
 HEALTH_CHECKS = [
     alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'),
@@ -803,6 +805,13 @@ class Module(MgrModule, OrchestratorClientMixin):
             'SMB Metadata',
             SMB_METADATA
         )
+        metrics['cephadm_daemon_status'] = Metric(
+            'gauge',
+            'cephadm_daemon_status',
+            'Status of cephadm daemons '
+            '(-2=unknown, -1=error, 0=stopped, 1=running, 2=starting)',
+            CEPHADM_DAEMON_STATUS
+        )
 
         for flag in OSD_FLAGS:
             path = 'osd_flag_{}'.format(flag)
@@ -993,6 +1002,52 @@ class Module(MgrModule, OrchestratorClientMixin):
                     (pool['pool_id'],)
                 )
 
+    @profile_method()
+    def set_cephadm_daemon_status_metrics(self) -> None:
+        """Export the status of every daemon known to the orchestrator.
+
+        One ``cephadm_daemon_status`` sample is set per daemon returned by
+        ``list_daemons()``. Collection is best effort: failures are logged
+        and never propagate to the caller.
+        """
+        def label(obj: object, *names: str) -> str:
+            # A field may be a plain attribute or a method; return the
+            # first of ``names`` that yields a non-empty value.
+            for name in names:
+                value = getattr(obj, name, None)
+                if callable(value):
+                    value = value()
+                if value:
+                    return str(value)
+            return ''
+
+        try:
+            daemons = raise_if_exception(self.list_daemons())
+        except Exception as e:
+            self.log.error(f"Failed to collect cephadm daemon status: {e}")
+            return
+
+        for daemon in daemons:
+            try:
+                status = getattr(daemon, 'status', None)
+                if status is None:
+                    # no state reported for this daemon; nothing to export
+                    continue
+                self.metrics['cephadm_daemon_status'].set(
+                    int(status),
+                    (
+                        label(daemon, 'daemon_type'),
+                        # NOTE(review): 'name' is a fallback in case there
+                        # is no 'daemon_name' attribute - confirm.
+                        label(daemon, 'daemon_name', 'name'),
+                        label(daemon, 'hostname'),
+                        label(daemon, 'service_name'),
+                    )
+                )
+            except Exception as e:
+                # one bad entry must not hide the remaining daemons
+                self.log.error(f"Failed to collect cephadm daemon status: {e}")
+
     @profile_method()
     def get_df(self) -> None:
         # maybe get the to-be-exported metrics from a config?
@@ -1840,6 +1895,7 @@ class Module(MgrModule, OrchestratorClientMixin):
         self.get_num_objects()
         self.get_all_daemon_health_metrics()
         self.get_smb_metadata()
+        self.set_cephadm_daemon_status_metrics()
 
         if not self.get_module_option('exclude_perf_counters'):
             self.get_perf_counters()
-- 
2.39.5