git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
monitoring: Add alert NVMeoFHostKeepAliveTimeout 64367/head
author Vallari Agrawal <vallari.agrawal@ibm.com>
Fri, 20 Jun 2025 14:37:00 +0000 (20:07 +0530)
committer Vallari Agrawal <vallari.agrawal@ibm.com>
Mon, 7 Jul 2025 11:40:24 +0000 (17:10 +0530)
Fixes: https://tracker.ceph.com/issues/71772
Signed-off-by: Vallari Agrawal <vallari.agrawal@ibm.com>
(cherry picked from commit 9977e5c963d6d9c303cf50335f2b6a52e60fba6e)

monitoring/ceph-mixin/config.libsonnet
monitoring/ceph-mixin/prometheus_alerts.libsonnet
monitoring/ceph-mixin/prometheus_alerts.yml
monitoring/ceph-mixin/tests_alerts/test_alerts.yml

index 988168007d4499e24f0c7a1543c7f9df68322206..ebe22e425f5b5713e3213fc0cde95fe648eb5603 100644 (file)
@@ -16,6 +16,7 @@
     NVMeoFMaxSubsystemsPerGateway: 128,
     NVMeoFMaxNamespaces: 2048,
     NVMeoFHighClientCount: 128,
+    NVMeoFHostKeepAliveTimeoutTrackDurationHours: 24,  // look-back window in hours; substituted into the NVMeoFHostKeepAliveTimeout alert's expr range and summary text
     NVMeoFHighHostCPU: 80,
     //
     // Read/Write latency is defined in ms
index f32798093c0bb990e223f13430db5d5f132e5af9..05bbcb9c2add67f0b90e5d1e9f14fb28885d8370 100644 (file)
             description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate',
           },
         },
+        {  // Warning alert raised when ceph_nvmeof_host_keepalive_timeout changed value within the configured look-back window.
+          alert: 'NVMeoFHostKeepAliveTimeout',
+          'for': '1m',  // expr must hold for 1 minute before the alert fires
+          expr: 'ceil(changes(ceph_nvmeof_host_keepalive_timeout[%(NVMeoFHostKeepAliveTimeoutTrackDurationHours)dh:]) / 2) > 0' % $._config,  // value changes over the window, halved and rounded up; presumably one timeout is a 0 -> 1 -> 0 flip (two changes) - TODO confirm against the exporter
+          labels: { severity: 'warning', type: 'ceph_default' },
+          annotations: {
+            summary: 'Host ({{ $labels.host_nqn }}) was disconnected {{ $value }} times from subsystem ({{ $labels.nqn }}) in last %(NVMeoFHostKeepAliveTimeoutTrackDurationHours)d hours' % $._config,  // {{ $value }} is the result of expr above; the hour count comes from $._config
+            description: 'Host was disconnected due to host keep alive timeout',
+          },
+        },
       ],
     },
   ],
index 22d121e70764b9cf04a52ea8506b154a20ff7693..5f840d93a98131b1894aed3663f0fd4e48aae950 100644 (file)
@@ -928,3 +928,12 @@ groups:
         labels:
           severity: "warning"
           type: "ceph_default"
+      - alert: "NVMeoFHostKeepAliveTimeout"  # warning when ceph_nvmeof_host_keepalive_timeout changed value during the last 24h
+        annotations:
+          description: "Host was disconnected due to host keep alive timeout"
+          summary: "Host ({{ $labels.host_nqn }}) was disconnected {{ $value }} times from subsystem ({{ $labels.nqn }}) in last 24 hours"  # {{ $value }} is the result of expr below
+        expr: "ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0"  # value changes over a 24h subquery, halved and rounded up; presumably one timeout is a 0 -> 1 -> 0 flip - TODO confirm
+        for: "1m"  # expr must hold for 1 minute before the alert fires
+        labels:
+          severity: "warning"
+          type: "ceph_default"
index 1184aa084db1e3a87e7f7585124cb665ffdfe54c..fdb892ff05718e00cb78441d833820909a43bb93 100644 (file)
@@ -3152,3 +3152,55 @@ tests:
         exp_annotations:
           summary: "The average write latency over the last 5 mins has reached 20 ms or more on node-1"
           description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
+
+ # NVMeoFHostKeepAliveTimeout: two hosts (nqn.1, nqn.2) on the same subsystem, one sample per hour; checks the raw expression at 2h/8h/29h and the alert at 1h/12h
+ - interval: 1h
+   input_series:
+    - series: 'ceph_nvmeof_host_keepalive_timeout{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+      values: '0 0 0 0 1 0 1 0 1 1 0x14 1 1 0x3'  # '0x14' is promtool expanding notation (a 0 followed by 14 more 0 samples), not a hex literal
+    - series: 'ceph_nvmeof_host_keepalive_timeout{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+      values: '0 1 1 0 0 0 0 0 0 0 0x19'  # a single 0 -> 1 -> 0 flip (at 1h and 3h), then flat 0
+   promql_expr_test:
+     - expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0
+       eval_time: 2h  # nqn.1 is still flat 0 (filtered out by "> 0"); nqn.2 has 1 change -> ceil(1/2) = 1
+       exp_samples:
+         - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+           value: 1
+     - expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0
+       eval_time: 8h  # nqn.1 has 5 changes by 8h -> ceil(5/2) = 3; nqn.2 has 2 changes -> 1
+       exp_samples:
+         - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+           value: 3
+         - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+           value: 1
+     - expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0
+       eval_time: 29h  # nqn.2's changes (at 1h and 3h) are older than 24h by now, so only nqn.1 is expected
+       exp_samples:
+         - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+           value: 3
+   alert_rule_test:
+    - eval_time: 1h  # no exp_alerts listed: the alert is expected not to be firing at this time
+      alertname: NVMeoFHostKeepAliveTimeout
+    - eval_time: 12h  # both hosts are expected to be firing, one alert per host_nqn
+      alertname: NVMeoFHostKeepAliveTimeout
+      exp_alerts:
+      - exp_labels:
+          gw_name: client.nvmeof.a
+          host_nqn: nqn.1
+          instance: node-1:10008
+          nqn: nqn.2016-06.io.spdk:cnode1.mygroup
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          summary: "Host (nqn.1) was disconnected 3 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours"
+          description: "Host was disconnected due to host keep alive timeout"
+      - exp_labels:
+          gw_name: client.nvmeof.a
+          host_nqn: nqn.2
+          instance: node-1:10008
+          nqn: nqn.2016-06.io.spdk:cnode1.mygroup
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          summary: "Host (nqn.2) was disconnected 1 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours"
+          description: "Host was disconnected due to host keep alive timeout"