]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
prometheus: Add Cephadm orch ps output metric to prometheus 64912/head
authorAnkush Behl <cloudbehl@gmail.com>
Fri, 8 Aug 2025 12:50:45 +0000 (18:20 +0530)
committerAnkush Behl <cloudbehl@gmail.com>
Fri, 26 Sep 2025 10:28:26 +0000 (15:58 +0530)
Fixes: https://tracker.ceph.com/issues/72496
Signed-off-by: Ankush Behl <cloudbehl@gmail.com>
src/pybind/mgr/prometheus/module.py

index 41b62fd6a93ad8101abb4cd1dd05eb289d4db515..a932d3d149bb36bc1c04719148fd7fae3cc5b03a 100644 (file)
@@ -115,6 +115,8 @@ NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
 SMB_METADATA = ('smb_version', 'volume',
                 'subvolume_group', 'subvolume', 'netbiosname', 'share')
 
+CEPHADM_DAEMON_STATUS = ('service_type', 'daemon_name', 'hostname', 'service_name')
+
 alert_metric = namedtuple('alert_metric', 'name description')
 HEALTH_CHECKS = [
     alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'),
@@ -803,6 +805,12 @@ class Module(MgrModule, OrchestratorClientMixin):
             'SMB Metadata',
             SMB_METADATA
         )
+        metrics['cephadm_daemon_status'] = Metric(
+            'gauge',
+            'cephadm_daemon_status',
+            'Status of cephadm daemons (0=stopped, 1=running, 2=errored)',
+            CEPHADM_DAEMON_STATUS
+        )
 
         for flag in OSD_FLAGS:
             path = 'osd_flag_{}'.format(flag)
@@ -993,6 +1001,30 @@ class Module(MgrModule, OrchestratorClientMixin):
                     (pool['pool_id'],)
                 )
 
+    @profile_method()
+    def set_cephadm_daemon_status_metrics(self) -> None:
+        try:
+            daemons = raise_if_exception(self.list_daemons())
+            for daemon in daemons:
+                service_type = getattr(daemon, 'daemon_type', '')
+                daemon_name = getattr(daemon, 'daemon_name', '')
+                hostname = str(getattr(daemon, 'hostname', ''))
+                status = getattr(daemon, 'status', '')
+                service_name_attr = getattr(daemon, 'service_name', '')
+                service_name = service_name_attr() if callable(service_name_attr) else str(service_name_attr)
+
+                self.metrics['cephadm_daemon_status'].set(
+                    int(status),
+                    (
+                        service_type,
+                        daemon_name,
+                        hostname,
+                        service_name,
+                    )
+                )
+        except Exception as e:
+            self.log.error(f"Failed to collect cephadm daemon status: {e}")
+
     @profile_method()
     def get_df(self) -> None:
         # maybe get the to-be-exported metrics from a config?
@@ -1840,6 +1872,7 @@ class Module(MgrModule, OrchestratorClientMixin):
         self.get_num_objects()
         self.get_all_daemon_health_metrics()
         self.get_smb_metadata()
+        self.set_cephadm_daemon_status_metrics()
 
         if not self.get_module_option('exclude_perf_counters'):
             self.get_perf_counters()