From e0c016ca30037c76c4feaaf80d2ea4ba432e430f Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Thu, 24 Apr 2025 17:38:12 +0530 Subject: [PATCH] monitoring: Fix NVMeoF subsys/namespace limit alerts Change NVMeoFTooManyNamespaces and NVMeoFTooManySubsystems alerts to trigger for ">= $limit" instead of "> $limit". Signed-off-by: Vallari Agrawal (cherry picked from commit 73dadbd269bebe1529a7c247725c5b6aabb1a093) --- monitoring/ceph-mixin/prometheus_alerts.libsonnet | 12 ++++++------ monitoring/ceph-mixin/prometheus_alerts.yml | 12 ++++++------ monitoring/ceph-mixin/tests_alerts/test_alerts.yml | 8 ++++---- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index 3fb84e24206e..f32798093c0b 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -928,21 +928,21 @@ { alert: 'NVMeoFTooManySubsystems', 'for': '1m', - expr: 'count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*?)(?::.*)?")) > %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway], + expr: 'count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*?)(?::.*)?")) >= %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway], labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'The number of subsystems defined to the gateway exceeds supported values%(cluster)s' % $.MultiClusterSummary(), - description: 'Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported', + summary: 'The number of subsystems defined to the NVMeoF gateway reached or exceeded the supported values%(cluster)s' % $.MultiClusterSummary(), + description: 'NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of %(NVMeoFMaxSubsystemsPerGateway)d 
subsystems. Current count: {{ $value }}.' % $._config, }, }, { alert: 'NVMeoFTooManyNamespaces', 'for': '1m', - expr: 'sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*?)(?::.*)?")) > %.2f' % [$._config.NVMeoFMaxNamespaces], + expr: 'sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*?)(?::.*)?")) >= %.2f' % [$._config.NVMeoFMaxNamespaces], labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'The number of namespaces defined to the gateway exceeds supported values%(cluster)s' % $.MultiClusterSummary(), - description: 'Although you may continue to create namespaces in {{ $labels.gateway_host }}, the configuration may not be supported', + summary: 'The number of namespaces defined to the NVMeoF gateway reached or exceeded supported values%(cluster)s' % $.MultiClusterSummary(), + description: 'NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of %(NVMeoFMaxNamespaces)d namespaces. Current count: {{ $value }}.' 
% $._config, }, }, { diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 1ce17ac07cd8..22d121e70764 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -830,18 +830,18 @@ groups: type: "ceph_default" - alert: "NVMeoFTooManySubsystems" annotations: - description: "Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported" - summary: "The number of subsystems defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}" - expr: "count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 128.00" + description: "NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of 128 subsystems. Current count: {{ $value }}." + summary: "The number of subsystems defined to the NVMeoF gateway reached or exceeded the supported values on cluster {{ $labels.cluster }}" + expr: "count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) >= 128.00" for: "1m" labels: severity: "warning" type: "ceph_default" - alert: "NVMeoFTooManyNamespaces" annotations: - description: "Although you may continue to create namespaces in {{ $labels.gateway_host }}, the configuration may not be supported" - summary: "The number of namespaces defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}" - expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 2048.00" + description: "NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of 2048 namespaces. Current count: {{ $value }}." 
+ summary: "The number of namespaces defined to the NVMeoF gateway reached or exceeded supported values on cluster {{ $labels.cluster }}" + expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) >= 2048.00" for: "1m" labels: severity: "warning" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 69e7b3a4b9cb..1184aa084db1 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -2838,8 +2838,8 @@ tests: cluster: mycluster type: ceph_default exp_annotations: - summary: "The number of subsystems defined to the gateway exceeds supported values on cluster mycluster" - description: "Although you may continue to create subsystems in node-1, the configuration may not be supported" + summary: "The number of subsystems defined to the NVMeoF gateway reached or exceeded the supported values on cluster mycluster" + description: "NVMeoF gateway node-1 has reached or exceeded the supported maximum of 128 subsystems. Current count: 129." # NVMeoFTooManyNamespaces - interval: 1m @@ -2882,8 +2882,8 @@ tests: cluster: mycluster type: ceph_default exp_annotations: - summary: "The number of namespaces defined to the gateway exceeds supported values on cluster mycluster" - description: "Although you may continue to create namespaces in node-1, the configuration may not be supported" + summary: "The number of namespaces defined to the NVMeoF gateway reached or exceeded supported values on cluster mycluster" + description: "NVMeoF gateway node-1 has reached or exceeded the supported maximum of 2048 namespaces. Current count: 2200." # NVMeoFVersionMismatch - interval: 1m -- 2.47.3