From: Vallari Agrawal Date: Tue, 8 Oct 2024 21:07:48 +0000 (+0530) Subject: monitoring: add 2 nvmeof alerts to prometheus_alerts.yaml X-Git-Tag: testing/wip-vshankar-testing-20241118.055430-debug~11^2~2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=f02e3128441f5bba95c6173d2acb61baa4111d01;p=ceph-ci.git monitoring: add 2 nvmeof alerts to prometheus_alerts.yaml - `NVMeoFMissingListener`: trigger if a gateway has no listener for a subsystem that has listeners on other gateways - `NVMeoFZeroListenerSubsystem`: trigger if a subsystem has no listeners Signed-off-by: Vallari Agrawal --- diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index ba6a6ded0a3..805ecb1188a 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -837,6 +837,24 @@ groups: labels: severity: "warning" type: "ceph_default" + - alert: "NVMeoFMissingListener" + annotations: + description: "For every subsystem, each gateway should have a listener to balance traffic between gateways." + summary: "No listener added for {{ $labels.instance }} NVMe-oF Gateway to {{ $labels.nqn }} subsystem" + expr: "ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0" + for: "10m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "NVMeoFZeroListenerSubsystem" + annotations: + description: "NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners." + summary: "No listeners added to {{ $labels.nqn }} subsystem" + expr: "sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0" + for: "10m" + labels: + severity: "warning" + type: "ceph_default" - alert: "NVMeoFHighHostCPU" annotations: description: "High CPU on a gateway host can lead to CPU contention and performance degradation"