From: Vallari Agrawal <vallari.agrawal@ibm.com>
Date: Fri, 20 Jun 2025 14:37:00 +0000 (+0530)
Subject: monitoring: Add alert NVMeoFHostKeepAliveTimeout
X-Git-Tag: v20.1.0~102^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F64367%2Fhead;p=ceph.git

monitoring: Add alert NVMeoFHostKeepAliveTimeout

Fixes: https://tracker.ceph.com/issues/71772

Signed-off-by: Vallari Agrawal <vallari.agrawal@ibm.com>
(cherry picked from commit 9977e5c963d6d9c303cf50335f2b6a52e60fba6e)
---

diff --git a/monitoring/ceph-mixin/config.libsonnet b/monitoring/ceph-mixin/config.libsonnet
index 988168007d4..ebe22e425f5 100644
--- a/monitoring/ceph-mixin/config.libsonnet
+++ b/monitoring/ceph-mixin/config.libsonnet
@@ -16,6 +16,7 @@
     NVMeoFMaxSubsystemsPerGateway: 128,
     NVMeoFMaxNamespaces: 2048,
     NVMeoFHighClientCount: 128,
+    NVMeoFHostKeepAliveTimeoutTrackDurationHours: 24,  // look-back window, in hours, substituted into the NVMeoFHostKeepAliveTimeout alert expr and summary
     NVMeoFHighHostCPU: 80,
     //
     // Read/Write latency is defined in ms
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
index f32798093c0..05bbcb9c2ad 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet
+++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
@@ -1035,6 +1035,16 @@
             description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate',
           },
         },
+        {
+          alert: 'NVMeoFHostKeepAliveTimeout',  // warning raised per (host_nqn, nqn) series whose keepalive-timeout metric changed value inside the look-back window
+          'for': '1m',
+          expr: 'ceil(changes(ceph_nvmeof_host_keepalive_timeout[%(NVMeoFHostKeepAliveTimeoutTrackDurationHours)dh:]) / 2) > 0' % $._config,  // [<N>h:] is a subquery over the last N hours; changes()/2 rounded up is what the summary reports as the disconnect count
+          labels: { severity: 'warning', type: 'ceph_default' },
+          annotations: {
+            summary: 'Host ({{ $labels.host_nqn }}) was disconnected {{ $value }} times from subsystem ({{ $labels.nqn }}) in last %(NVMeoFHostKeepAliveTimeoutTrackDurationHours)d hours' % $._config,  // {{ ... }} is left for Prometheus templating; only %(...)d is filled in from $._config here
+            description: 'Host was disconnected due to host keep alive timeout',
+          },
+        },
       ],
     },
   ],
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index 22d121e7076..5f840d93a98 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -928,3 +928,12 @@ groups:
         labels:
           severity: "warning"
           type: "ceph_default"
+      - alert: "NVMeoFHostKeepAliveTimeout"
+        annotations:
+          description: "Host was disconnected due to host keep alive timeout"
+          summary: "Host ({{ $labels.host_nqn }}) was disconnected {{ $value }} times from subsystem ({{ $labels.nqn }}) in last 24 hours"
+        expr: "ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0"
+        for: "1m"
+        labels:
+          severity: "warning"
+          type: "ceph_default"
diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index 1184aa084db..fdb892ff057 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -3152,3 +3152,55 @@ tests:
         exp_annotations:
           summary: "The average write latency over the last 5 mins has reached 20 ms or more on node-1"
           description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
+
+ # NVMeoFHostKeepAliveTimeout: two hosts (nqn.1, nqn.2) on the same subsystem, one input sample per hour
+ - interval: 1h
+   input_series:
+    - series: 'ceph_nvmeof_host_keepalive_timeout{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+      values: '0 0 0 0 1 0 1 0 1 1 0x14 1 1 0x3'  # value changes at 4h, 5h, 6h, 7h, 8h and 10h, then again at 25h and 27h
+    - series: 'ceph_nvmeof_host_keepalive_timeout{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+      values: '0 1 1 0 0 0 0 0 0 0 0x19'  # value changes at 1h and 3h only, flat afterwards
+   promql_expr_test:
+     - expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0
+       eval_time: 2h  # nqn.2 has 1 change so far -> ceil(1/2) = 1; nqn.1 is still flat and is dropped by "> 0"
+       exp_samples:
+         - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+           value: 1
+     - expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0
+       eval_time: 8h  # nqn.1 has 5 changes -> ceil(5/2) = 3; nqn.2 has 2 changes -> 1
+       exp_samples:
+         - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+           value: 3
+         - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+           value: 1
+     - expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0
+       eval_time: 29h  # window is now 5h-29h: nqn.2's changes have aged out; nqn.1 has 6 changes in window -> 3
+       exp_samples:
+         - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}'
+           value: 3
+   alert_rule_test:
+    - eval_time: 1h  # no exp_alerts listed: nothing is expected to be firing here (nqn.2's first change only lands at 1h; presumably still pending under the rule's for: 1m)
+      alertname: NVMeoFHostKeepAliveTimeout
+    - eval_time: 12h  # both hosts have changes inside the window: nqn.1 -> 3, nqn.2 -> 1
+      alertname: NVMeoFHostKeepAliveTimeout
+      exp_alerts:
+      - exp_labels:
+          gw_name: client.nvmeof.a
+          host_nqn: nqn.1
+          instance: node-1:10008
+          nqn: nqn.2016-06.io.spdk:cnode1.mygroup
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          summary: "Host (nqn.1) was disconnected 3 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours"
+          description: "Host was disconnected due to host keep alive timeout"
+      - exp_labels:
+          gw_name: client.nvmeof.a
+          host_nqn: nqn.2
+          instance: node-1:10008
+          nqn: nqn.2016-06.io.spdk:cnode1.mygroup
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          summary: "Host (nqn.2) was disconnected 1 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours"
+          description: "Host was disconnected due to host keep alive timeout"