From: Abhishek Desai Date: Thu, 30 Oct 2025 04:40:27 +0000 (+0530) Subject: mgr/dashboard : Add certmgr alerts and warnings to Prometheus and dashboard X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4b5096cc0de40a1b80bf801b4261bb58b0702e82;p=ceph.git mgr/dashboard : Add certmgr alerts and warnings to Prometheus and dashboard Fixes: https://tracker.ceph.com/issues/73674 Signed-off-by: Abhishek Desai Adds certmgr alert rules (CephCertificateError, CephCertificateWarning), matching rule unit tests, and mgr/prometheus support for exposing certificate health-check messages as a label on ceph_health_detail. --- diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 446297dad61..ac5893a00e1 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -12,6 +12,9 @@ adjusting FEATURE_TOGGLE_DASHBOARD. * DASHBOARD: RGW Service form updated to take input regarding QAT compression. - QAT compression is an optional field which can be set to 'Hardware' or 'Software' by selecting options from provided dropdwon. If 'None' is selected, compression is removed altogether. +* Monitoring: Added new Prometheus alerts for certificate management: + - CephCertificateError: Fires when a Ceph certificate has expired (critical severity). + - CephCertificateWarning: Fires when a Ceph certificate is about to expire (warning severity). * CephFS: The `peer_add` command is deprecated in favor of the `peer_bootstrap` command. 
* RADOS: When objects are read during deep scrubs, the data is read in strides, and the scrubbing process is delayed between each read in order to avoid monopolizing diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index 05bbcb9c2ad..9c1c3db4375 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -1047,5 +1047,30 @@ }, ], }, + { + name: 'certmgr', + rules: [ + { + alert: 'CephCertificateError', + 'for': '1m', + expr: 'ceph_health_detail{name="CEPHADM_CERT_ERROR"} == 1', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.15.1' }, + annotations: { + summary: 'Ceph certificate error detected%(cluster)s' % $.MultiClusterSummary(), + description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue.", + }, + }, + { + alert: 'CephCertificateWarning', + 'for': '1m', + expr: 'ceph_health_detail{name="CEPHADM_CERT_WARNING"} == 1', + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.15.2' }, + annotations: { + summary: 'Ceph certificate warning detected%(cluster)s' % $.MultiClusterSummary(), + description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue.", + }, + }, + ], + }, ], } diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 87631e81296..8a6c411a2b6 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -973,3 +973,26 @@ groups: labels: severity: "warning" type: "ceph_default" + - name: "certmgr" + rules: + - alert: "CephCertificateError" + annotations: + description: "{{ $labels.message }}. 
Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue." + summary: "Ceph certificate error detected on cluster {{ $labels.cluster }}" + expr: "ceph_health_detail{name=\"CEPHADM_CERT_ERROR\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.15.1" + severity: "critical" + type: "ceph_default" + - alert: "CephCertificateWarning" + annotations: + description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue." + summary: "Ceph certificate warning detected on cluster {{ $labels.cluster }}" + expr: "ceph_health_detail{name=\"CEPHADM_CERT_WARNING\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.15.2" + severity: "warning" + type: "ceph_default" + diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 131d9811d56..5578544c934 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -3209,3 +3209,59 @@ tests: exp_annotations: summary: "Host (nqn.2) was disconnected 1 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours" description: "Host was disconnected due to host keep alive timeout" + +# Certificate Management - Error Alert + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="CEPHADM_CERT_ERROR", cluster="mycluster", message="Certificate has expired", severity="HEALTH_ERR"}' + values: '1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="CEPHADM_CERT_ERROR"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="CEPHADM_CERT_ERROR", cluster="mycluster", message="Certificate has expired", severity="HEALTH_ERR"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephCertificateError + - eval_time: 5m + alertname: CephCertificateError + exp_alerts: + - exp_labels: + 
name: CEPHADM_CERT_ERROR + severity: critical + cluster: mycluster + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.15.1 + message: Certificate has expired + exp_annotations: + summary: Ceph certificate error detected on cluster mycluster + description: "Certificate has expired. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue." + +# Certificate Management - Warning Alert + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="CEPHADM_CERT_WARNING", cluster="mycluster", message="Certificate expires soon", severity="HEALTH_WARN"}' + values: '1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="CEPHADM_CERT_WARNING"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="CEPHADM_CERT_WARNING", cluster="mycluster", message="Certificate expires soon", severity="HEALTH_WARN"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephCertificateWarning + - eval_time: 5m + alertname: CephCertificateWarning + exp_alerts: + - exp_labels: + name: CEPHADM_CERT_WARNING + severity: warning + cluster: mycluster + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.15.2 + message: Certificate expires soon + exp_annotations: + summary: Ceph certificate warning detected on cluster mycluster + description: "Certificate expires soon. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue." 
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 26b0da41d6d..5c91d74fa2f 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -149,7 +149,7 @@ HEALTH_CHECKS = [ alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'), ] -HEALTHCHECK_DETAIL = ('name', 'severity') +HEALTHCHECK_DETAIL = ('name', 'severity', 'message') class Severity(enum.Enum): @@ -979,6 +979,30 @@ class Module(MgrModule, OrchestratorClientMixin): self.log.info('Config changed, signaling serve loop to restart engine') self.config_change_event.set() + def _process_cert_health_detail(self, alert_id: str, health_data: dict) -> None: + """Process certificate health check details and set metrics.""" + severity = health_data.get('severity', 'unknown') + detail_messages = health_data.get('detail', []) + if not detail_messages: + return + + for detail_entry in detail_messages: + message = detail_entry.get('message', '') + if not message: + continue + + try: + self.metrics['health_detail'].set( + 1, + ( + alert_id, + str(severity), + str(message) + ) + ) + except Exception as e: + self.log.error(f"Failed to process {alert_id} message '{message}': {e}") + @profile_method() def get_health(self) -> None: @@ -1028,13 +1052,21 @@ class Module(MgrModule, OrchestratorClientMixin): # health check is not active, so give it a default of 0 self.metrics[path].set(0) + for alert_id in ('CEPHADM_CERT_ERROR', 'CEPHADM_CERT_WARNING'): + if alert_id in active_names: + self._process_cert_health_detail(alert_id, active_healthchecks[alert_id]) + self.health_history.check(health) for name, info in self.health_history.healthcheck.items(): + # Skip CEPHADM_CERT_ERROR and CEPHADM_CERT_WARNING as they're handled specially above with message details + if name in ('CEPHADM_CERT_ERROR', 'CEPHADM_CERT_WARNING'): + continue v = 1 if info.active else 0 self.metrics['health_detail'].set( v, ( name, - str(info.severity)) + 
str(info.severity), + '') ) @profile_method()