]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/dashboard: Add certmgr alerts and warnings to Prometheus and dashboard 66090/head
authorAbhishek Desai <abhishek.desai1@ibm.com>
Thu, 30 Oct 2025 04:40:27 +0000 (10:10 +0530)
committerAbhishek Desai <abhishek.desai1@ibm.com>
Mon, 23 Feb 2026 14:51:29 +0000 (20:21 +0530)
Fixes: https://tracker.ceph.com/issues/73674
Signed-off-by: Abhishek Desai <abhishek.desai1@ibm.com>
New commit adding certmgr alerts

PendingReleaseNotes
monitoring/ceph-mixin/prometheus_alerts.libsonnet
monitoring/ceph-mixin/prometheus_alerts.yml
monitoring/ceph-mixin/tests_alerts/test_alerts.yml
src/pybind/mgr/prometheus/module.py

index 446297dad6179e1a3f8f3e387069f606ca6bd150..ac5893a00e145d784c831dc6af3e77888ae0a121 100644 (file)
@@ -12,6 +12,9 @@
   adjusting FEATURE_TOGGLE_DASHBOARD.
 * DASHBOARD: RGW Service form updated to take input regarding QAT compression.
   - QAT compression is an optional field which can be set to 'Hardware' or 'Software' by selecting options from the provided dropdown. If 'None' is selected, compression is removed altogether.
+* Monitoring: Added new Prometheus alerts for certificate management:
+  - CephCertificateError: Fires when a Ceph certificate has expired (critical severity).
+  - CephCertificateWarning: Fires when a Ceph certificate is about to expire (warning severity).
 * CephFS: The `peer_add` command is deprecated in favor of the `peer_bootstrap` command.
 * RADOS: When objects are read during deep scrubs, the data is read in strides,
   and the scrubbing process is delayed between each read in order to avoid monopolizing
index 05bbcb9c2add67f0b90e5d1e9f14fb28885d8370..9c1c3db4375e1ba5a5af81d96be33310c33474b6 100644 (file)
         },
       ],
     },
+    {
+      name: 'certmgr',
+      rules: [
+        {
+          alert: 'CephCertificateError',
+          'for': '1m',
+          expr: 'ceph_health_detail{name="CEPHADM_CERT_ERROR"} == 1',
+          labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.15.1' },
+          annotations: {
+            summary: 'Ceph certificate error detected%(cluster)s' % $.MultiClusterSummary(),
+            description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue.",
+          },
+        },
+        {
+          alert: 'CephCertificateWarning',
+          'for': '1m',
+          expr: 'ceph_health_detail{name="CEPHADM_CERT_WARNING"} == 1',
+          labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.15.2' },
+          annotations: {
+            summary: 'Ceph certificate warning detected%(cluster)s' % $.MultiClusterSummary(),
+            description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue.",
+          },
+        },
+      ],
+    },
   ],
 }
index 87631e81296a2ba74da6e73668e372d5847503c9..8a6c411a2b63d9b2147e8d5256ce5b6905cff3f9 100644 (file)
@@ -973,3 +973,26 @@ groups:
         labels:
           severity: "warning"
           type: "ceph_default"
+  - name: "certmgr"
+    rules:
+      - alert: "CephCertificateError"
+        annotations:
+          description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue."
+          summary: "Ceph certificate error detected on cluster {{ $labels.cluster }}"
+        expr: "ceph_health_detail{name=\"CEPHADM_CERT_ERROR\"} == 1"
+        for: "1m"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.15.1"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "CephCertificateWarning"
+        annotations:
+          description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue."
+          summary: "Ceph certificate warning detected on cluster {{ $labels.cluster }}"
+        expr: "ceph_health_detail{name=\"CEPHADM_CERT_WARNING\"} == 1"
+        for: "1m"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.15.2"
+          severity: "warning"
+          type: "ceph_default"
+
index 131d9811d562fadfc63ebf8e296a86b58dcaf6bc..5578544c934f70a5070ea88a1b90f613d4c2181b 100644 (file)
@@ -3209,3 +3209,59 @@ tests:
         exp_annotations:
           summary: "Host (nqn.2) was disconnected 1 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours"
           description: "Host was disconnected due to host keep alive timeout"
+
+# Certificate Management - Error Alert
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="CEPHADM_CERT_ERROR", cluster="mycluster", message="Certificate has expired", severity="HEALTH_ERR"}'
+      values: '1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="CEPHADM_CERT_ERROR"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="CEPHADM_CERT_ERROR", cluster="mycluster", message="Certificate has expired", severity="HEALTH_ERR"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: CephCertificateError
+    - eval_time: 5m
+      alertname: CephCertificateError
+      exp_alerts:
+      - exp_labels:
+          name: CEPHADM_CERT_ERROR
+          severity: critical
+          cluster: mycluster
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.15.1
+          message: Certificate has expired
+        exp_annotations:
+          summary: Ceph certificate error detected on cluster mycluster
+          description: "Certificate has expired. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue."
+
+# Certificate Management - Warning Alert
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="CEPHADM_CERT_WARNING", cluster="mycluster", message="Certificate expires soon", severity="HEALTH_WARN"}'
+      values: '1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="CEPHADM_CERT_WARNING"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="CEPHADM_CERT_WARNING", cluster="mycluster", message="Certificate expires soon", severity="HEALTH_WARN"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: CephCertificateWarning
+    - eval_time: 5m
+      alertname: CephCertificateWarning
+      exp_alerts:
+      - exp_labels:
+          name: CEPHADM_CERT_WARNING
+          severity: warning
+          cluster: mycluster
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.15.2
+          message: Certificate expires soon
+        exp_annotations:
+          summary: Ceph certificate warning detected on cluster mycluster
+          description: "Certificate expires soon. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue."
index 26b0da41d6d078c8cb390f048845f61647bff9a2..5c91d74fa2f5afab99e75e38d26b822f312cb477 100644 (file)
@@ -149,7 +149,7 @@ HEALTH_CHECKS = [
     alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'),
 ]
 
-HEALTHCHECK_DETAIL = ('name', 'severity')
+HEALTHCHECK_DETAIL = ('name', 'severity', 'message')
 
 
 class Severity(enum.Enum):
@@ -979,6 +979,30 @@ class Module(MgrModule, OrchestratorClientMixin):
         self.log.info('Config changed, signaling serve loop to restart engine')
         self.config_change_event.set()
 
+    def _process_cert_health_detail(self, alert_id: str, health_data: dict) -> None:
+        """Process certificate health check details and set metrics."""
+        severity = health_data.get('severity', 'unknown')
+        detail_messages = health_data.get('detail', [])
+        if not detail_messages:
+            return
+
+        for detail_entry in detail_messages:
+            message = detail_entry.get('message', '')
+            if not message:
+                continue
+
+            try:
+                self.metrics['health_detail'].set(
+                    1,
+                    (
+                        alert_id,
+                        str(severity),
+                        str(message)
+                    )
+                )
+            except Exception as e:
+                self.log.error(f"Failed to process {alert_id} message '{message}': {e}")
+
     @profile_method()
     def get_health(self) -> None:
 
@@ -1028,13 +1052,21 @@ class Module(MgrModule, OrchestratorClientMixin):
                     # health check is not active, so give it a default of 0
                     self.metrics[path].set(0)
 
+        for alert_id in ('CEPHADM_CERT_ERROR', 'CEPHADM_CERT_WARNING'):
+            if alert_id in active_names:
+                self._process_cert_health_detail(alert_id, active_healthchecks[alert_id])
+
         self.health_history.check(health)
         for name, info in self.health_history.healthcheck.items():
+            # Skip CEPHADM_CERT_ERROR and CEPHADM_CERT_WARNING as they're handled specially above with message details
+            if name in ('CEPHADM_CERT_ERROR', 'CEPHADM_CERT_WARNING'):
+                continue
             v = 1 if info.active else 0
             self.metrics['health_detail'].set(
                 v, (
                     name,
-                    str(info.severity))
+                    str(info.severity),
+                    '')
             )
 
     @profile_method()