From: Abhishek Desai Date: Thu, 30 Oct 2025 04:40:27 +0000 (+0530) Subject: mgr/dashboard : Add certmgr alerts and warnings to Prometheus and dashboard X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4b5096cc0de40a1b80bf801b4261bb58b0702e82;p=ceph.git mgr/dashboard : Add certmgr alerts and warnings to Prometheus and dashboard Fixes: https://tracker.ceph.com/issues/73674 Signed-off-by: Abhishek Desai Adds certmgr alert rules (CephCertificateError, CephCertificateWarning), matching rule unit tests, and mgr/prometheus support for exposing certificate health-check messages as a label on ceph_health_detail. --- diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 446297dad61..ac5893a00e1 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -12,6 +12,9 @@ adjusting FEATURE_TOGGLE_DASHBOARD. * DASHBOARD: RGW Service form updated to take input regarding QAT compression. - QAT compression is an optional field which can be set to 'Hardware' or 'Software' by selecting options from provided dropdwon. If 'None' is selected, compression is removed altogether. +* Monitoring: Added new Prometheus alerts for certificate management: + - CephCertificateError: Fires when a Ceph certificate has expired (critical severity). + - CephCertificateWarning: Fires when a Ceph certificate is about to expire (warning severity). * CephFS: The `peer_add` command is deprecated in favor of the `peer_bootstrap` command. 
* RADOS: When objects are read during deep scrubs, the data is read in strides, and the scrubbing process is delayed between each read in order to avoid monopolizing diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index 05bbcb9c2ad..9c1c3db4375 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -1047,5 +1047,30 @@ }, ], }, + { + name: 'certmgr', + rules: [ + { + alert: 'CephCertificateError', + 'for': '1m', + expr: 'ceph_health_detail{name="CEPHADM_CERT_ERROR"} == 1', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.15.1' }, + annotations: { + summary: 'Ceph certificate error detected%(cluster)s' % $.MultiClusterSummary(), + description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue.", + }, + }, + { + alert: 'CephCertificateWarning', + 'for': '1m', + expr: 'ceph_health_detail{name="CEPHADM_CERT_WARNING"} == 1', + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.15.2' }, + annotations: { + summary: 'Ceph certificate warning detected%(cluster)s' % $.MultiClusterSummary(), + description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue.", + }, + }, + ], + }, ], } diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 87631e81296..8a6c411a2b6 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -973,3 +973,26 @@ groups: labels: severity: "warning" type: "ceph_default" + - name: "certmgr" + rules: + - alert: "CephCertificateError" + annotations: + description: "{{ $labels.message }}. 
Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue." + summary: "Ceph certificate error detected on cluster {{ $labels.cluster }}" + expr: "ceph_health_detail{name=\"CEPHADM_CERT_ERROR\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.15.1" + severity: "critical" + type: "ceph_default" + - alert: "CephCertificateWarning" + annotations: + description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue." + summary: "Ceph certificate warning detected on cluster {{ $labels.cluster }}" + expr: "ceph_health_detail{name=\"CEPHADM_CERT_WARNING\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.15.2" + severity: "warning" + type: "ceph_default" + diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 131d9811d56..5578544c934 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -3209,3 +3209,59 @@ tests: exp_annotations: summary: "Host (nqn.2) was disconnected 1 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours" description: "Host was disconnected due to host keep alive timeout" + +# Certificate Management - Error Alert + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="CEPHADM_CERT_ERROR", cluster="mycluster", message="Certificate has expired", severity="HEALTH_ERR"}' + values: '1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="CEPHADM_CERT_ERROR"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="CEPHADM_CERT_ERROR", cluster="mycluster", message="Certificate has expired", severity="HEALTH_ERR"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephCertificateError + - eval_time: 5m + alertname: CephCertificateError + exp_alerts: + - exp_labels: + 
name: CEPHADM_CERT_ERROR + severity: critical + cluster: mycluster + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.15.1 + message: Certificate has expired + exp_annotations: + summary: Ceph certificate error detected on cluster mycluster + description: "Certificate has expired. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue." + +# Certificate Management - Warning Alert + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="CEPHADM_CERT_WARNING", cluster="mycluster", message="Certificate expires soon", severity="HEALTH_WARN"}' + values: '1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="CEPHADM_CERT_WARNING"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="CEPHADM_CERT_WARNING", cluster="mycluster", message="Certificate expires soon", severity="HEALTH_WARN"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: CephCertificateWarning + - eval_time: 5m + alertname: CephCertificateWarning + exp_alerts: + - exp_labels: + name: CEPHADM_CERT_WARNING + severity: warning + cluster: mycluster + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.15.2 + message: Certificate expires soon + exp_annotations: + summary: Ceph certificate warning detected on cluster mycluster + description: "Certificate expires soon. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue." 
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 26b0da41d6d..5c91d74fa2f 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -149,7 +149,7 @@ HEALTH_CHECKS = [ alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'), ] -HEALTHCHECK_DETAIL = ('name', 'severity') +HEALTHCHECK_DETAIL = ('name', 'severity', 'message') class Severity(enum.Enum): @@ -979,6 +979,30 @@ class Module(MgrModule, OrchestratorClientMixin): self.log.info('Config changed, signaling serve loop to restart engine') self.config_change_event.set() + def _process_cert_health_detail(self, alert_id: str, health_data: dict) -> None: + """Process certificate health check details and set metrics.""" + severity = health_data.get('severity', 'unknown') + detail_messages = health_data.get('detail', []) + if not detail_messages: + return + + for detail_entry in detail_messages: + message = detail_entry.get('message', '') + if not message: + continue + + try: + self.metrics['health_detail'].set( + 1, + ( + alert_id, + str(severity), + str(message) + ) + ) + except Exception as e: + self.log.error(f"Failed to process {alert_id} message '{message}': {e}") + @profile_method() def get_health(self) -> None: @@ -1028,13 +1052,21 @@ class Module(MgrModule, OrchestratorClientMixin): # health check is not active, so give it a default of 0 self.metrics[path].set(0) + for alert_id in ('CEPHADM_CERT_ERROR', 'CEPHADM_CERT_WARNING'): + if alert_id in active_names: + self._process_cert_health_detail(alert_id, active_healthchecks[alert_id]) + self.health_history.check(health) for name, info in self.health_history.healthcheck.items(): + # Skip CEPHADM_CERT_ERROR and CEPHADM_CERT_WARNING as they're handled specially above with message details + if name in ('CEPHADM_CERT_ERROR', 'CEPHADM_CERT_WARNING'): + continue v = 1 if info.active else 0 self.metrics['health_detail'].set( v, ( name, - str(info.severity)) + 
str(info.severity), + '') ) @profile_method()