adjusting FEATURE_TOGGLE_DASHBOARD.
* DASHBOARD: RGW Service form updated to take input regarding QAT compression.
- QAT compression is an optional field which can be set to 'Hardware' or 'Software' by selecting an option from the provided dropdown. If 'None' is selected, compression is removed altogether.
+* Monitoring: Added new Prometheus alerts for certificate management:
+ - CephCertificateError: Fires when a Ceph certificate has expired (critical severity).
+ - CephCertificateWarning: Fires when a Ceph certificate is about to expire (warning severity).
* CephFS: The `peer_add` command is deprecated in favor of the `peer_bootstrap` command.
* RADOS: When objects are read during deep scrubs, the data is read in strides,
and the scrubbing process is delayed between each read in order to avoid monopolizing
},
],
},
+ {
+ name: 'certmgr',
+ rules: [
+ {
+ alert: 'CephCertificateError',
+ 'for': '1m',
+ expr: 'ceph_health_detail{name="CEPHADM_CERT_ERROR"} == 1',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.15.1' },
+ annotations: {
+ summary: 'Ceph certificate error detected%(cluster)s' % $.MultiClusterSummary(),
+ description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue.",
+ },
+ },
+ {
+ alert: 'CephCertificateWarning',
+ 'for': '1m',
+ expr: 'ceph_health_detail{name="CEPHADM_CERT_WARNING"} == 1',
+ labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.15.2' },
+ annotations: {
+ summary: 'Ceph certificate warning detected%(cluster)s' % $.MultiClusterSummary(),
+ description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue.",
+ },
+ },
+ ],
+ },
],
}
labels:
severity: "warning"
type: "ceph_default"
+ - name: "certmgr"
+ rules:
+ - alert: "CephCertificateError"
+ annotations:
+ description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue."
+ summary: "Ceph certificate error detected on cluster {{ $labels.cluster }}"
+ expr: "ceph_health_detail{name=\"CEPHADM_CERT_ERROR\"} == 1"
+ for: "1m"
+ labels:
+ oid: "1.3.6.1.4.1.50495.1.2.1.15.1"
+ severity: "critical"
+ type: "ceph_default"
+ - alert: "CephCertificateWarning"
+ annotations:
+ description: "{{ $labels.message }}. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue."
+ summary: "Ceph certificate warning detected on cluster {{ $labels.cluster }}"
+ expr: "ceph_health_detail{name=\"CEPHADM_CERT_WARNING\"} == 1"
+ for: "1m"
+ labels:
+ oid: "1.3.6.1.4.1.50495.1.2.1.15.2"
+ severity: "warning"
+ type: "ceph_default"
+
exp_annotations:
summary: "Host (nqn.2) was disconnected 1 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours"
description: "Host was disconnected due to host keep alive timeout"
+
+# Certificate Management - Error Alert
+ - interval: 1m
+ input_series:
+ - series: 'ceph_health_detail{name="CEPHADM_CERT_ERROR", cluster="mycluster", message="Certificate has expired", severity="HEALTH_ERR"}'
+ values: '1 1 1 1 1 1 1 1 1'
+ promql_expr_test:
+ - expr: ceph_health_detail{name="CEPHADM_CERT_ERROR"} > 0
+ eval_time: 2m
+ exp_samples:
+ - labels: '{__name__="ceph_health_detail", name="CEPHADM_CERT_ERROR", cluster="mycluster", message="Certificate has expired", severity="HEALTH_ERR"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: CephCertificateError
+ - eval_time: 5m
+ alertname: CephCertificateError
+ exp_alerts:
+ - exp_labels:
+ name: CEPHADM_CERT_ERROR
+ severity: critical
+ cluster: mycluster
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.15.1
+ message: Certificate has expired
+ exp_annotations:
+ summary: Ceph certificate error detected on cluster mycluster
+ description: "Certificate has expired. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue."
+
+# Certificate Management - Warning Alert
+ - interval: 1m
+ input_series:
+ - series: 'ceph_health_detail{name="CEPHADM_CERT_WARNING", cluster="mycluster", message="Certificate expires soon", severity="HEALTH_WARN"}'
+ values: '1 1 1 1 1 1 1 1 1'
+ promql_expr_test:
+ - expr: ceph_health_detail{name="CEPHADM_CERT_WARNING"} > 0
+ eval_time: 2m
+ exp_samples:
+ - labels: '{__name__="ceph_health_detail", name="CEPHADM_CERT_WARNING", cluster="mycluster", message="Certificate expires soon", severity="HEALTH_WARN"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: CephCertificateWarning
+ - eval_time: 5m
+ alertname: CephCertificateWarning
+ exp_alerts:
+ - exp_labels:
+ name: CEPHADM_CERT_WARNING
+ severity: warning
+ cluster: mycluster
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.1.2.1.15.2
+ message: Certificate expires soon
+ exp_annotations:
+ summary: Ceph certificate warning detected on cluster mycluster
+ description: "Certificate expires soon. Please check 'ceph health detail' for more information and take appropriate action to resolve the certificate issue."
alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'),
]
-HEALTHCHECK_DETAIL = ('name', 'severity')
+HEALTHCHECK_DETAIL = ('name', 'severity', 'message')
class Severity(enum.Enum):
self.log.info('Config changed, signaling serve loop to restart engine')
self.config_change_event.set()
+ def _process_cert_health_detail(self, alert_id: str, health_data: dict) -> None:
+ """Process certificate health check details and set metrics."""
+ severity = health_data.get('severity', 'unknown')
+ detail_messages = health_data.get('detail', [])
+ if not detail_messages:
+ return
+
+ for detail_entry in detail_messages:
+ message = detail_entry.get('message', '')
+ if not message:
+ continue
+
+ try:
+ self.metrics['health_detail'].set(
+ 1,
+ (
+ alert_id,
+ str(severity),
+ str(message)
+ )
+ )
+ except Exception as e:
+ self.log.error(f"Failed to process {alert_id} message '{message}': {e}")
+
@profile_method()
def get_health(self) -> None:
# health check is not active, so give it a default of 0
self.metrics[path].set(0)
+ for alert_id in ('CEPHADM_CERT_ERROR', 'CEPHADM_CERT_WARNING'):
+ if alert_id in active_names:
+ self._process_cert_health_detail(alert_id, active_healthchecks[alert_id])
+
self.health_history.check(health)
for name, info in self.health_history.healthcheck.items():
+ # Skip CEPHADM_CERT_ERROR and CEPHADM_CERT_WARNING as they're handled specially above with message details
+ if name in ('CEPHADM_CERT_ERROR', 'CEPHADM_CERT_WARNING'):
+ continue
v = 1 if info.active else 0
self.metrics['health_detail'].set(
v, (
name,
- str(info.severity))
+ str(info.severity),
+ '')
)
@profile_method()