git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
monitoring: Update nvmeof alert limits in config
author Vallari Agrawal <vallari.agrawal@ibm.com>
Thu, 9 Jan 2025 10:54:33 +0000 (16:24 +0530)
committer Vallari Agrawal <vallari.agrawal@ibm.com>
Thu, 9 Jan 2025 10:59:29 +0000 (16:29 +0530)
Update these in config.libsonnet:
- NVMeoFMaxGatewaysPerGroup (4->8)
- NVMeoFMaxGatewaysPerCluster (4->32)
- NVMeoFMaxNamespaces (1024->2048)
- NVMeoFHighClientCount (32->128)

Also update prometheus_alerts.yml and test_alerts.yml
accordingly.

Signed-off-by: Vallari Agrawal <vallari.agrawal@ibm.com>
monitoring/ceph-mixin/config.libsonnet
monitoring/ceph-mixin/prometheus_alerts.yml
monitoring/ceph-mixin/tests_alerts/test_alerts.yml

index a15b88422fcb1939ef4967cae58bd62e55e8d6dc..e917b4c2dacbdb0ad7ab1cf93bf453cae29e5d5c 100644 (file)
@@ -9,12 +9,12 @@
     CephNodeNetworkPacketDropsPerSec: 10,
     CephRBDMirrorImageTransferBandwidthThreshold: 0.8,
     CephRBDMirrorImagesPerDaemonThreshold: 100,
-    NVMeoFMaxGatewaysPerGroup: 4,
-    NVMeoFMaxGatewaysPerCluster: 4,
+    NVMeoFMaxGatewaysPerGroup: 8,
+    NVMeoFMaxGatewaysPerCluster: 32,
     NVMeoFHighGatewayCPU: 80,
     NVMeoFMaxSubsystemsPerGateway: 128,
-    NVMeoFMaxNamespaces: 1024,
-    NVMeoFHighClientCount: 32,
+    NVMeoFMaxNamespaces: 2048,
+    NVMeoFHighClientCount: 128,
     NVMeoFHighHostCPU: 80,
     //
     // Read/Write latency is defined in ms
index 3440d761351f131dfd63d37b1ddce27adf480de2..7c0da4d51a4cf0eff8a864475f1ce283d6d70000 100644 (file)
@@ -776,18 +776,18 @@ groups:
           type: "ceph_default"
       - alert: "NVMeoFTooManyGateways"
         annotations:
-          description: "You may create many gateways, but 4 is the tested limit"
+          description: "You may create many gateways, but 32 is the tested limit"
           summary: "Max supported gateways exceeded on cluster {{ $labels.cluster }}"
-        expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 4.00"
+        expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 32.00"
         for: "1m"
         labels:
           severity: "warning"
           type: "ceph_default"
       - alert: "NVMeoFMaxGatewayGroupSize"
         annotations:
-          description: "You may create many gateways in a gateway group, but 4 is the tested limit"
+          description: "You may create many gateways in a gateway group, but 8 is the tested limit"
           summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }}"
-        expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00"
+        expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 8.00"
         for: "1m"
         labels:
           severity: "warning"
@@ -832,7 +832,7 @@ groups:
         annotations:
           description: "Although you may continue to create namespaces in {{ $labels.gateway_host }}, the configuration may not be supported"
           summary: "The number of namespaces defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}"
-        expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 1024.00"
+        expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 2048.00"
         for: "1m"
         labels:
           severity: "warning"
@@ -848,9 +848,9 @@ groups:
           type: "ceph_default"
       - alert: "NVMeoFHighClientCount"
         annotations:
-          description: "The supported limit for clients connecting to a subsystem is 32"
+          description: "The supported limit for clients connecting to a subsystem is 128"
           summary: "The number of clients connected to {{ $labels.nqn }} is too high on cluster {{ $labels.cluster }}"
-        expr: "ceph_nvmeof_subsystem_host_count > 32.00"
+        expr: "ceph_nvmeof_subsystem_host_count > 128.00"
         for: "1m"
         labels:
           severity: "warning"
index b3b29308d08b7376fdda2feb500f9a671c9ab759..83b4ff80375179118c1412728e42fa418904ac4f 100644 (file)
@@ -2331,12 +2331,69 @@ tests:
       values: '1+0x20'
     - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5",cluster="mycluster"}'
       values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.6",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.7",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.8",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.9",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.10",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.11",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.12",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.13",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.14",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.15",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.16",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.17",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.18",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.19",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.20",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.21",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.22",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.23",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.24",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.25",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.26",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.27",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.28",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.29",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.30",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.31",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.32",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.33",cluster="mycluster"}'
+      values: '1+0x20'
+
    promql_expr_test:
-     - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 4.00
+     - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 32.00
        eval_time: 1m
        exp_samples:
          - labels: '{cluster="mycluster"}'
-           value: 5
+           value: 33
    alert_rule_test:
     - eval_time: 5m
       alertname: NVMeoFTooManyGateways
@@ -2347,7 +2404,7 @@ tests:
           type: ceph_default
         exp_annotations:
           summary: "Max supported gateways exceeded on cluster mycluster"
-          description: "You may create many gateways, but 4 is the tested limit"
+          description: "You may create many gateways, but 32 is the tested limit"
 
  # NVMeoFMaxGatewayGroupSize
  - interval: 1m
@@ -2362,16 +2419,24 @@ tests:
       values: '1+0x20'
     - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.12",cluster="mycluster"}'
       values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.10",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.14",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.11",cluster="mycluster"}'
+      values: '1+0x20'
+    - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.13",cluster="mycluster"}'
+      values: '1+0x20' 
     - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4",cluster="mycluster"}'
       values: '1+0x20'
     - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5",cluster="mycluster"}'
       values: '1+0x20'
    promql_expr_test:
-     - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 4.00
+     - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 8.00
        eval_time: 1m
        exp_samples:
          - labels: '{cluster="mycluster",group="group-1"}'
-           value: 5
+           value: 9
    alert_rule_test:
     - eval_time: 5m
       alertname: NVMeoFMaxGatewayGroupSize
@@ -2383,7 +2448,7 @@ tests:
           type: ceph_default
         exp_annotations:
           summary: "Max gateways within a gateway group (group-1) exceeded on cluster mycluster"
-          description: "You may create many gateways in a gateway group, but 4 is the tested limit"
+          description: "You may create many gateways in a gateway group, but 8 is the tested limit"
 
  # NVMeoFSingleGatewayGroup
  - interval: 1m
@@ -2767,12 +2832,14 @@ tests:
       values: '200+0x10'
     - series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn10",cluster="mycluster"}'
       values: '200+0x10'
+    - series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn11",cluster="mycluster"}'
+      values: '200+0x10'
    promql_expr_test:
-     - expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 1024
+     - expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 2048
        eval_time: 1m
        exp_samples:
          - labels: '{gateway_host="node-1", cluster="mycluster"}'
-           value: 2000
+           value: 2200
    alert_rule_test:
     - eval_time: 5m
       alertname: NVMeoFTooManyNamespaces
@@ -2815,15 +2882,15 @@ tests:
  - interval: 1m
    input_series:
     - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1",cluster="mycluster"}'
-      values: '2 2 2 4 4 8 8 8 10 10 20 20 32 34 34 38 38 40 44 44'
+      values: '2 4 8 10 20 30 40 50 62 74 80 95 100 110 130 130 130 130 130 130'
     - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2",cluster="mycluster"}'
-      values: '2 2 2 8 8 8 16 16 16 16 16 16 16 16 16 16 16 16 16 16'
+      values: '2 8 16 16 16 16 16 16 16 16 20 20 32 34 34 36 37 37 37 37'
    promql_expr_test:
-     - expr: ceph_nvmeof_subsystem_host_count > 32.00
+     - expr: ceph_nvmeof_subsystem_host_count > 128.00
        eval_time: 15m
        exp_samples:
          - labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1",cluster="mycluster"}'
-           value: 38
+           value: 130
    alert_rule_test:
     - eval_time: 20m
       alertname: NVMeoFHighClientCount
@@ -2835,7 +2902,7 @@ tests:
           type: ceph_default
         exp_annotations:
           summary: "The number of clients connected to nqn1 is too high on cluster mycluster"
-          description: "The supported limit for clients connecting to a subsystem is 32"
+          description: "The supported limit for clients connecting to a subsystem is 128"
  
  # NVMeoFMissingListener
  - interval: 1m