From: Paul Cuzner
Date: Tue, 13 Feb 2024 00:55:24 +0000 (+1300)
Subject: ceph-mixins: Add test cases for nvmeof alerts
X-Git-Tag: testing/wip-pdonnell-testing-20240517.203521-squid-debug~74^2~2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=535c79257b54c4c40950665cd61369977f01f846;p=ceph-ci.git

ceph-mixins: Add test cases for nvmeof alerts

Signed-off-by: Paul Cuzner

(cherry picked from commit c2534a6dbacf0f2ff5c649f7a0e04b5a94dbfdc0)
---

diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index 9cb688e9ca0..40d6f4d0983 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -2219,3 +2219,426 @@ tests:
              exp_annotations:
                summary: Fan error(s) detected
                description: "Fan error(s) detected. Check `ceph health detail`."
+
+# nvmeof Tests
+  # NVMeoFSubsystemNamespaceLimit
+   - interval: 1m
+     input_series:
+       - series: 'ceph_nvmeof_subsystem_namespace_limit{nqn="wah"}'
+         values: '5x10'
+       - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk1"}'
+         values: '1x10'
+       - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk2"}'
+         values: '1x10'
+       - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk3"}'
+         values: '1x10'
+       - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk4"}'
+         values: '1x10'
+       - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk5"}'
+         values: '1x10'
+       - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk6"}'
+         values: '1x10'
+     promql_expr_test:
+       - expr: (count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit
+         eval_time: 1m
+         exp_samples:
+           - labels: '{nqn="wah"}'
+             value: 6
+     alert_rule_test:
+       - eval_time: 5m
+         alertname: NVMeoFSubsystemNamespaceLimit
+         exp_alerts:
+           - exp_labels:
+               nqn: wah
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "wah subsystem has reached its maximum number of namespaces "
+               description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to wah"
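+
+  # Note on the series notation used throughout: promtool expands 'a+bxn'
+  # into n+1 samples starting at a and stepping by b, so '1+0x20' holds the
+  # value 1 for 21 samples, and a bare 'axn' (as in '5x10') repeats a.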
+
+  # NVMeoFTooManyGateways
+   - interval: 1m
+     input_series:
+       - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1"}'
+         values: '1+0x20'
+       - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2"}'
+         values: '1+0x20'
+       - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3"}'
+         values: '1+0x20'
+       - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4"}'
+         values: '1+0x20'
+       - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5"}'
+         values: '1+0x20'
+     promql_expr_test:
+       - expr: count(ceph_nvmeof_gateway_info) > 4.00
+         eval_time: 1m
+         exp_samples:
+           - labels: '{}'
+             value: 5
+     alert_rule_test:
+       - eval_time: 5m
+         alertname: NVMeoFTooManyGateways
+         exp_alerts:
+           - exp_labels:
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "Max supported gateways exceeded "
+               description: "You may create many gateways, but 4 is the tested limit"
+
+  # NVMeoFMaxGatewayGroupSize
+   - interval: 1m
+     input_series:
+       - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.1"}'
+         values: '1+0x20'
+       - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2"}'
+         values: '1+0x20'
+       - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.3"}'
+         values: '1+0x20'
+       - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4"}'
+         values: '1+0x20'
+       - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5"}'
+         values: '1+0x20'
+     promql_expr_test:
+       - expr: count by(group) (ceph_nvmeof_gateway_info) > 2.00
+         eval_time: 1m
+         exp_samples:
+           - labels: '{group="group-1"}'
+             value: 3
+     alert_rule_test:
+       - eval_time: 5m
+         alertname: NVMeoFMaxGatewayGroupSize
+         exp_alerts:
+           - exp_labels:
+               group: group-1
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "Max gateways within a gateway group (group-1) exceeded "
+               description: "You may create many gateways in a gateway group, but 2 is the tested limit"
+
+  # NVMeoFSingleGatewayGroup
+   - interval: 1m
+     input_series:
+       - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2"}'
+         values: '1+0x20'
+       - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4"}'
+         values: '1+0x20'
+       - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5"}'
+         values: '1+0x20'
+     promql_expr_test:
+       - expr: count by(group) (ceph_nvmeof_gateway_info) == 1
+         eval_time: 1m
+         exp_samples:
+           - labels: '{group="group-1"}'
+             value: 1
+     alert_rule_test:
+       - eval_time: 5m
+         alertname: NVMeoFSingleGatewayGroup
+         exp_alerts:
+           - exp_labels:
+               group: group-1
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "The gateway group group-1 consists of a single gateway - HA is not possible "
+               description: "Although a single member gateway group is valid, it should only be used for test purposes"
+
+  # NVMeoFHighGatewayCPU
+   - interval: 1m
+     input_series:
+       - series: 'ceph_nvmeof_reactor_seconds_total{mode="busy",name="nvmf_tgt_poll_group_0",instance="node-1:10008"}'
+         values: '880+5080x20'
+     promql_expr_test:
+       - expr: label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > 80
+         eval_time: 5m
+         exp_samples:
+           - labels: '{instance="node-1"}'
+             value: 8.466666666666667E+01
+     alert_rule_test:
+       - eval_time: 15m
+         alertname: NVMeoFHighGatewayCPU
+         exp_alerts:
+           - exp_labels:
+               instance: node-1
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "CPU used by node-1 NVMe-oF Gateway is high "
+               description: "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores"
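+
+  # In the expression above, label_replace(...,"instance","$1","instance","(.*):.*")
+  # rewrites the scrape instance without its port (node-1:10008 -> node-1), so
+  # the alert is keyed on the gateway host rather than the scrape endpoint.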
+
+  # NVMeoFGatewayOpenSecurity
+   - interval: 1m
+     input_series:
+       - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.good", allow_any_host="no"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.bad", allow_any_host="yes"}'
+         values: '1+0x10'
+     promql_expr_test:
+       - expr: ceph_nvmeof_subsystem_metadata{allow_any_host="yes"}
+         eval_time: 1m
+         exp_samples:
+           - labels: '{__name__="ceph_nvmeof_subsystem_metadata",nqn="nqn.bad",allow_any_host="yes"}'
+             value: 1
+     alert_rule_test:
+       - eval_time: 5m
+         alertname: NVMeoFGatewayOpenSecurity
+         exp_alerts:
+           - exp_labels:
+               allow_any_host: yes
+               nqn: nqn.bad
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "Subsystem nqn.bad has been defined without host level security "
+               description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss"
+
+  # NVMeoFTooManySubsystems
+   - interval: 1m
+     input_series:
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn1"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn2"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn3"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn4"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn5"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn6"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn7"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn8"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn9"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn10"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn11"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn12"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn13"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn14"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn15"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn16"}'
+         values: '1+0x10'
+       - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn17"}'
+         values: '1+0x10'
+     promql_expr_test:
+       - expr: count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > 16
+         eval_time: 1m
+         exp_samples:
+           - labels: '{gateway_host="node-1"}'
+             value: 17
+     alert_rule_test:
+       - eval_time: 5m
+         alertname: NVMeoFTooManySubsystems
+         exp_alerts:
+           - exp_labels:
+               gateway_host: node-1
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "The number of subsystems defined to the gateway exceeds supported values "
+               description: "Although you may continue to create subsystems in node-1, the configuration may not be supported"
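+
+  # The next test relies on a nested count: count by(version) collapses the
+  # gateway series to one series per distinct release, and the outer count
+  # then counts those series - a result > 1 means mixed gateway versions.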
+
+  # NVMeoFVersionMismatch
+   - interval: 1m
+     input_series:
+       - series: 'ceph_nvmeof_gateway_info{version="0.0.7"}'
+         values: '1+0x80'
+       - series: 'ceph_nvmeof_gateway_info{version="1.0.0"}'
+         values: '1+0x80'
+     promql_expr_test:
+       - expr: count(count by(version) (ceph_nvmeof_gateway_info)) > 1
+         eval_time: 1m
+         exp_samples:
+           - labels: '{}'
+             value: 2
+     alert_rule_test:
+       - eval_time: 1h
+         alertname: NVMeoFVersionMismatch
+         exp_alerts:
+           - exp_labels:
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "The cluster has different NVMe-oF gateway releases active "
+               description: "This may indicate an issue with deployment. Check cephadm logs"
+
+  # NVMeoFHighClientCount
+   - interval: 1m
+     input_series:
+       - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1"}'
+         values: '2 2 2 4 4 8 8 8 10 10 20 20 32 34 34 38 38 40 44 44'
+       - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2"}'
+         values: '2 2 2 8 8 8 16 16 16 16 16 16 16 16 16 16 16 16 16 16'
+     promql_expr_test:
+       - expr: ceph_nvmeof_subsystem_host_count > 32.00
+         eval_time: 15m
+         exp_samples:
+           - labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1"}'
+             value: 38
+     alert_rule_test:
+       - eval_time: 20m
+         alertname: NVMeoFHighClientCount
+         exp_alerts:
+           - exp_labels:
+               nqn: nqn1
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "The number of clients connected to nqn1 is too high "
+               description: "The supported limit for clients connecting to a subsystem is 32"
+
+  # NVMeoFHighHostCPU
+   - interval: 1m
+     input_series:
+       - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="0"}'
+         values: '0+18x10 180+9x20'
+       - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="1"}'
+         values: '0+18x10 180+9x20'
+       - series: 'ceph_nvmeof_gateway_info{instance="node-1:10008"}'
+         values: '1.00+0x20'
+     promql_expr_test:
+       - expr: 100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= 80
+         eval_time: 16m
+         exp_samples:
+           - labels: '{host="node-1",instance="node-1:10008"}'
+             value: 85
+     alert_rule_test:
+       # negative match at 15m
+       - eval_time: 15m
+         alertname: NVMeoFHighHostCPU
+       # positive match at 25m
+       - eval_time: 25m
+         alertname: NVMeoFHighHostCPU
+         exp_alerts:
+           - exp_labels:
+               instance: node-1:10008
+               host: node-1
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "The CPU is high (85%) on NVMeoF Gateway host (node-1) "
+               description: "High CPU on a gateway host can lead to CPU contention and performance degradation"
+
+  # NVMeoFInterfaceDown - triggered on eth0 only
+   - interval: 30s
+     input_series:
+       - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down", device="eth0"}'
+         values: '1+0x30'
+       - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="up", device="eth1"}'
+         values: '1+0x30'
+     promql_expr_test:
+       - expr: ceph_nvmeof_subsystem_listener_iface_info{operstate="down"}
+         eval_time: 1m
+         exp_samples:
+           - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth0", operstate="down"}'
+             value: 1
+     alert_rule_test:
+       - eval_time: 5m
+         alertname: NVMeoFInterfaceDown
+         exp_alerts:
+           - exp_labels:
+               oid: 1.3.6.1.4.1.50495.1.2.1.14.1
+               operstate: down
+               device: eth0
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "Network interface eth0 is down "
+               description: "A NIC used by one or more subsystems is in a down state"
+
+  # NVMeoFInterfaceDuplex - triggered on eth1 only
+   - interval: 30s
+     input_series:
+       - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="full", device="eth0"}'
+         values: '1+0x30'
+       - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="half", device="eth1"}'
+         values: '1+0x30'
+     promql_expr_test:
+       - expr: ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"}
+         eval_time: 30s
+         exp_samples:
+           - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth1", duplex="half"}'
+             value: 1
+     alert_rule_test:
+       - eval_time: 5m
+         alertname: NVMeoFInterfaceDuplex
+         exp_alerts:
+           - exp_labels:
+               duplex: half
+               device: eth1
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "Network interface eth1 is not running in full duplex mode "
+               description: "Until this is resolved, performance from the gateway will be degraded"
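+
+  # The latency tests below derive mean seconds per I/O from two counters:
+  # rate(..._seconds_total[1m]) / rate(..._completed_total[1m]). With the
+  # series used here, each 30s step adds 3000s of I/O time across 120000
+  # I/Os, i.e. 0.025s (25 ms) per operation.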
+
+  # NVMeoFHighReadLatency
+   - interval: 30s
+     input_series:
+       - series: 'ceph_nvmeof_bdev_read_seconds_total{instance="node-1:10008",bdev_name="disk1"}'
+         values: '0+1680x10 19800+3000x20'
+       - series: 'ceph_nvmeof_bdev_reads_completed_total{instance="node-1:10008",bdev_name="disk1"}'
+         values: '0+286000x10 2980000+120000x20'
+     promql_expr_test:
+       - expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),"gateway","$1","instance","(.*):.*") > 0.01
+         eval_time: 10m
+         exp_samples:
+           - labels: '{gateway="node-1",instance="node-1:10008"}'
+             value: 0.025
+     alert_rule_test:
+       # negative test - latency is lower than 0.01s
+       - eval_time: 4m
+         alertname: NVMeoFHighReadLatency
+       # positive test - latency is higher than 0.01s
+       - eval_time: 15m
+         alertname: NVMeoFHighReadLatency
+         exp_alerts:
+           - exp_labels:
+               gateway: node-1
+               instance: node-1:10008
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "The average read latency over the last 5 mins has reached 10 ms or more on node-1"
+               description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
+
+  # NVMeoFHighWriteLatency
+   - interval: 30s
+     input_series:
+       - series: 'ceph_nvmeof_bdev_write_seconds_total{instance="node-1:10008",bdev_name="disk1"}'
+         values: '0+1680x10 19800+3000x20'
+       - series: 'ceph_nvmeof_bdev_writes_completed_total{instance="node-1:10008",bdev_name="disk1"}'
+         values: '0+286000x10 2980000+120000x20'
+     promql_expr_test:
+       - expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[1m]) / rate(ceph_nvmeof_bdev_writes_completed_total[1m])))),"gateway","$1","instance","(.*):.*") > 0.02
+         eval_time: 10m
+         exp_samples:
+           - labels: '{gateway="node-1",instance="node-1:10008"}'
+             value: 0.025
+     alert_rule_test:
+       # negative test - latency is lower than 0.02s
+       - eval_time: 4m
+         alertname: NVMeoFHighWriteLatency
+       # positive test - latency is higher than 0.02s
+       - eval_time: 15m
+         alertname: NVMeoFHighWriteLatency
+         exp_alerts:
+           - exp_labels:
+               gateway: node-1
+               instance: node-1:10008
+               severity: warning
+               type: ceph_default
+             exp_annotations:
+               summary: "The average write latency over the last 5 mins has reached 20 ms or more on node-1"
+               description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
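
These test cases use promtool's rule unit test format. Assuming a promtool
binary is available and the alert rules referenced by the test file's
rule_files section have been rendered, they can be exercised locally with
something like:

    promtool test rules monitoring/ceph-mixin/tests_alerts/test_alerts.yml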