From: Vallari Agrawal Date: Wed, 26 Feb 2025 16:01:54 +0000 (+0530) Subject: monitoring: fix NVMeoFSubsystemNamespaceLimit X-Git-Tag: testing/wip-vshankar-testing-20250306.055054-debug~2^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=4a7866aeccfa29a7f1e46f8ca21a2487345d0d73;p=ceph-ci.git monitoring: fix NVMeoFSubsystemNamespaceLimit Alert is not triggered as expected, change the query to fix that. BZ: https://bugzilla.redhat.com/show_bug.cgi?id=2282348 Signed-off-by: Vallari Agrawal --- diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index f3e06a76f1f..2546a676222 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -848,7 +848,7 @@ { alert: 'NVMeoFSubsystemNamespaceLimit', 'for': '1m', - expr: '(count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit', + expr: '(count by(nqn, cluster, instance) (ceph_nvmeof_subsystem_namespace_metadata)) >= on(nqn, instance) group_right(cluster) ceph_nvmeof_subsystem_namespace_limit', labels: { severity: 'warning', type: 'ceph_default' }, annotations: { summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces%(cluster)s' % $.MultiClusterSummary(), diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index f8bcd4ca0e1..33636f4ef41 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -760,7 +760,7 @@ groups: annotations: description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}" summary: "{{ $labels.nqn }} subsystem has reached its maximum number of namespaces on cluster {{ $labels.cluster }}" - expr: "(count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit" + expr: "(count by(nqn, cluster, instance) (ceph_nvmeof_subsystem_namespace_metadata)) >= on(nqn, instance) group_right(cluster) ceph_nvmeof_subsystem_namespace_limit" for: "1m" labels: severity: "warning"