From 4a7866aeccfa29a7f1e46f8ca21a2487345d0d73 Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Wed, 26 Feb 2025 21:31:54 +0530 Subject: [PATCH] monitoring: fix NVMeoFSubsystemNamespaceLimit The alert is not triggered as expected; change the query to fix that. BZ: https://bugzilla.redhat.com/show_bug.cgi?id=2282348 Signed-off-by: Vallari Agrawal --- monitoring/ceph-mixin/prometheus_alerts.libsonnet | 2 +- monitoring/ceph-mixin/prometheus_alerts.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index f3e06a76f1f7..2546a6762227 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -848,7 +848,7 @@ { alert: 'NVMeoFSubsystemNamespaceLimit', 'for': '1m', - expr: '(count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit', + expr: '(count by(nqn, cluster, instance) (ceph_nvmeof_subsystem_namespace_metadata)) >= on(nqn, instance) group_right(cluster) ceph_nvmeof_subsystem_namespace_limit', labels: { severity: 'warning', type: 'ceph_default' }, annotations: { summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces%(cluster)s' % $.MultiClusterSummary(), diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index f8bcd4ca0e12..33636f4ef413 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -760,7 +760,7 @@ groups: annotations: description: "Subsystems have a max namespace limit defined at creation time. 
This alert means that no more namespaces can be added to {{ $labels.nqn }}" summary: "{{ $labels.nqn }} subsystem has reached its maximum number of namespaces on cluster {{ $labels.cluster }}" - expr: "(count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit" + expr: "(count by(nqn, cluster, instance) (ceph_nvmeof_subsystem_namespace_metadata)) >= on(nqn, instance) group_right(cluster) ceph_nvmeof_subsystem_namespace_limit" for: "1m" labels: severity: "warning" -- 2.47.3