git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
monitoring: Update nvmeof alert limits in config
author Vallari Agrawal <vallari.agrawal@ibm.com>
Thu, 9 Jan 2025 10:54:33 +0000 (16:24 +0530)
committer Alexander Indenbaum <aindenba@redhat.com>
Wed, 19 Nov 2025 18:41:52 +0000 (20:41 +0200)
Update these in config.libsonnet:
- NVMeoFMaxGatewaysPerGroup (4->8)
- NVMeoFMaxGatewaysPerCluster (4->32)

Also update prometheus_alerts.yml and test_alerts.yml
accordingly.

Resolves: rhbz#2324172

Signed-off-by: Vallari Agrawal <vallari.agrawal@ibm.com>
(cherry picked from commit f3c18818b9b937550f503d94ca37f75639db5514)

monitoring/ceph-mixin/config.libsonnet
monitoring/ceph-mixin/prometheus_alerts.yml
monitoring/ceph-mixin/tests_alerts/test_alerts.yml

index c0af859e459ce0e4668338080ea605bc35d1e263..17b126d7eed63eb019d75c2e5f3dd6092fea8d7e 100644 (file)
@@ -9,8 +9,8 @@
     CephNodeNetworkPacketDropsPerSec: 10,
     CephRBDMirrorImageTransferBandwidthThreshold: 0.8,
     CephRBDMirrorImagesPerDaemonThreshold: 100,
-    NVMeoFMaxGatewaysPerGroup: 4,
-    NVMeoFMaxGatewaysPerCluster: 4,
+    NVMeoFMaxGatewaysPerGroup: 8,
+    NVMeoFMaxGatewaysPerCluster: 32,
     NVMeoFHighGatewayCPU: 80,
     NVMeoFMaxSubsystemsPerGateway: 16,
     NVMeoFHighClientCount: 32,
index ba06f01f1f91ccf6a995efa7fbc869c4e7d7067c..5e0b29e66a17d384b2cb253a775fbaa4ad921022 100644 (file)
@@ -793,18 +793,18 @@ groups:
           type: "ceph_default"
       - alert: "NVMeoFTooManyGateways"
         annotations:
-          description: "You may create many gateways, but 4 is the tested limit"
-          summary: "Max supported gateways exceeded "
-        expr: "count(ceph_nvmeof_gateway_info) > 4.00"
+          description: "You may create many gateways, but 32 is the tested limit"
+          summary: "Max supported gateways exceeded on cluster {{ $labels.cluster }}"
+        expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 32.00"
         for: "1m"
         labels:
           severity: "warning"
           type: "ceph_default"
       - alert: "NVMeoFMaxGatewayGroupSize"
         annotations:
-          description: "You may create many gateways in a gateway group, but 2 is the tested limit"
-          summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded "
-        expr: "count by(group) (ceph_nvmeof_gateway_info) > 2.00"
+          description: "You may create many gateways in a gateway group, but 8 is the tested limit"
+          summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }}"
+        expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 8.00"
         for: "1m"
         labels:
           severity: "warning"
index 0efc3c9ad24957d83f46f5356180f004665472a4..ad3a198355c005d34240618a7f25ac973c922bac 100644 (file)
@@ -2249,52 +2249,122 @@ tests:
  # NVMeoFTooManyGateways
  - interval: 1m
    input_series:
-    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1"}'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1",cluster="mycluster"}'
       values: '1+0x20'
-    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2"}'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2",cluster="mycluster"}'
       values: '1+0x20'
-    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3"}'
-      values: '1+0x20'      
-    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4"}'
-      values: '1+0x20' 
-    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5"}'
-      values: '1+0x20'             
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.6",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.7",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.8",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.9",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.10",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.11",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.12",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.13",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.14",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.15",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.16",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.17",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.18",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.19",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.20",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.21",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.22",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.23",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.24",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.25",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.26",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.27",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.28",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.29",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.30",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.31",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.32",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.33",cluster="mycluster"}'
+      values: '1+0x20'
+
    promql_expr_test:
-     - expr: count(ceph_nvmeof_gateway_info) > 4.00
+     - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 32.00
        eval_time: 1m
        exp_samples:
-         - labels: '{}'
-           value: 5
+         - labels: '{cluster="mycluster"}'
+           value: 33
    alert_rule_test:
     - eval_time: 5m
       alertname: NVMeoFTooManyGateways
       exp_alerts:
       - exp_labels:
           severity: warning
+          cluster: mycluster
           type: ceph_default
         exp_annotations:
-          summary: "Max supported gateways exceeded "
-          description: "You may create many gateways, but 4 is the tested limit"
+          summary: "Max supported gateways exceeded on cluster mycluster"
+          description: "You may create many gateways, but 32 is the tested limit"
 
  # NVMeoFMaxGatewayGroupSize
  - interval: 1m
    input_series:
-    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.1"}'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.1",cluster="mycluster"}'
       values: '1+0x20'
-    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2"}'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2",cluster="mycluster"}'
       values: '1+0x20'
-    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.3"}'
-      values: '1+0x20'      
-    - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4"}'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.3",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.9",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.12",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.10",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.14",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.11",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.13",cluster="mycluster"}'
       values: '1+0x20' 
-    - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5"}'
-      values: '1+0x20'             
+    - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5",cluster="mycluster"}'
+      values: '1+0x20'
    promql_expr_test:
-     - expr: count by(group) (ceph_nvmeof_gateway_info) > 2.00
+     - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 8.00
        eval_time: 1m
        exp_samples:
-         - labels: '{group="group-1"}'
-           value: 3
+         - labels: '{cluster="mycluster",group="group-1"}'
+           value: 9
    alert_rule_test:
     - eval_time: 5m
       alertname: NVMeoFMaxGatewayGroupSize
@@ -2302,10 +2372,11 @@ tests:
       - exp_labels:
           group: group-1
           severity: warning
+          cluster: mycluster
           type: ceph_default
         exp_annotations:
-          summary: "Max gateways within a gateway group (group-1) exceeded "
-          description: "You may create many gateways in a gateway group, but 2 is the tested limit"
+          summary: "Max gateways within a gateway group (group-1) exceeded on cluster mycluster"
+          description: "You may create many gateways in a gateway group, but 8 is the tested limit"
 
  # NVMeoFSingleGatewayGroup
  - interval: 1m