From: Vallari Agrawal Date: Thu, 24 Apr 2025 12:08:12 +0000 (+0530) Subject: monitoring: Fix NVMeoF subsys/namespace limit alerts X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=73dadbd269bebe1529a7c247725c5b6aabb1a093;p=ceph.git monitoring: Fix NVMeoF subsys/namespace limit alerts Change NVMeoFTooManyNamespaces and NVMeoFTooManySubsystems alerts to trigger for ">= $limit" instead of "> $limit". Signed-off-by: Vallari Agrawal --- diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index 3fb84e24206ec..f32798093c0bb 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -928,21 +928,21 @@ { alert: 'NVMeoFTooManySubsystems', 'for': '1m', - expr: 'count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*?)(?::.*)?")) > %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway], + expr: 'count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*?)(?::.*)?")) >= %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway], labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'The number of subsystems defined to the gateway exceeds supported values%(cluster)s' % $.MultiClusterSummary(), - description: 'Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported', + summary: 'The number of subsystems defined to the NVMeoF gateway reached or exceeded the supported values%(cluster)s' % $.MultiClusterSummary(), + description: 'NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of %(NVMeoFMaxSubsystemsPerGateway)d subsystems. Current count: {{ $value }}.' 
% $._config, }, }, { alert: 'NVMeoFTooManyNamespaces', 'for': '1m', - expr: 'sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*?)(?::.*)?")) > %.2f' % [$._config.NVMeoFMaxNamespaces], + expr: 'sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*?)(?::.*)?")) >= %.2f' % [$._config.NVMeoFMaxNamespaces], labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'The number of namespaces defined to the gateway exceeds supported values%(cluster)s' % $.MultiClusterSummary(), - description: 'Although you may continue to create namespaces in {{ $labels.gateway_host }}, the configuration may not be supported', + summary: 'The number of namespaces defined to the NVMeoF gateway reached or exceeded supported values%(cluster)s' % $.MultiClusterSummary(), + description: 'NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of %(NVMeoFMaxNamespaces)d namespaces. Current count: {{ $value }}.' % $._config, }, }, { diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 1ce17ac07cd85..22d121e70764b 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -830,18 +830,18 @@ groups: type: "ceph_default" - alert: "NVMeoFTooManySubsystems" annotations: - description: "Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported" - summary: "The number of subsystems defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}" - expr: "count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 128.00" + description: "NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of 128 subsystems. 
Current count: {{ $value }}." + summary: "The number of subsystems defined to the NVMeoF gateway reached or exceeded the supported values on cluster {{ $labels.cluster }}" + expr: "count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) >= 128.00" for: "1m" labels: severity: "warning" type: "ceph_default" - alert: "NVMeoFTooManyNamespaces" annotations: - description: "Although you may continue to create namespaces in {{ $labels.gateway_host }}, the configuration may not be supported" - summary: "The number of namespaces defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}" - expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 2048.00" + description: "NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of 2048 namespaces. Current count: {{ $value }}." + summary: "The number of namespaces defined to the NVMeoF gateway reached or exceeded supported values on cluster {{ $labels.cluster }}" + expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) >= 2048.00" for: "1m" labels: severity: "warning" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 69e7b3a4b9cbc..1184aa084db1e 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -2838,8 +2838,8 @@ tests: cluster: mycluster type: ceph_default exp_annotations: - summary: "The number of subsystems defined to the gateway exceeds supported values on cluster mycluster" - description: "Although you may continue to create subsystems in node-1, the configuration may not be supported" + summary: "The number of subsystems defined to the NVMeoF gateway reached or 
exceeded the supported values on cluster mycluster" + description: "NVMeoF gateway node-1 has reached or exceeded the supported maximum of 128 subsystems. Current count: 129." # NVMeoFTooManyNamespaces - interval: 1m @@ -2882,8 +2882,8 @@ tests: cluster: mycluster type: ceph_default exp_annotations: - summary: "The number of namespaces defined to the gateway exceeds supported values on cluster mycluster" - description: "Although you may continue to create namespaces in node-1, the configuration may not be supported" + summary: "The number of namespaces defined to the NVMeoF gateway reached or exceeded supported values on cluster mycluster" + description: "NVMeoF gateway node-1 has reached or exceeded the supported maximum of 2048 namespaces. Current count: 2200." # NVMeoFVersionMismatch - interval: 1m