From: Vallari Agrawal Date: Fri, 20 Jun 2025 14:37:00 +0000 (+0530) Subject: monitoring: Add alert NVMeoFHostKeepAliveTimeout X-Git-Tag: v20.1.0~102^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F64367%2Fhead;p=ceph.git monitoring: Add alert NVMeoFHostKeepAliveTimeout Fixes: https://tracker.ceph.com/issues/71772 Signed-off-by: Vallari Agrawal (cherry picked from commit 9977e5c963d6d9c303cf50335f2b6a52e60fba6e) --- diff --git a/monitoring/ceph-mixin/config.libsonnet b/monitoring/ceph-mixin/config.libsonnet index 988168007d44..ebe22e425f5b 100644 --- a/monitoring/ceph-mixin/config.libsonnet +++ b/monitoring/ceph-mixin/config.libsonnet @@ -16,6 +16,7 @@ NVMeoFMaxSubsystemsPerGateway: 128, NVMeoFMaxNamespaces: 2048, NVMeoFHighClientCount: 128, + NVMeoFHostKeepAliveTimeoutTrackDurationHours: 24, NVMeoFHighHostCPU: 80, // // Read/Write latency is defined in ms diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index f32798093c0b..05bbcb9c2add 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -1035,6 +1035,16 @@ description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate', }, }, + { + alert: 'NVMeoFHostKeepAliveTimeout', + 'for': '1m', + expr: 'ceil(changes(ceph_nvmeof_host_keepalive_timeout[%(NVMeoFHostKeepAliveTimeoutTrackDurationHours)dh:]) / 2) > 0' % $._config, + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'Host ({{ $labels.host_nqn }}) was disconnected {{ $value }} times from subsystem ({{ $labels.nqn }}) in last %(NVMeoFHostKeepAliveTimeoutTrackDurationHours)d hours' % $._config, + description: 'Host was disconnected due to host keep alive timeout', + }, + }, ], }, ], diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 22d121e70764..5f840d93a981 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -928,3 +928,12 @@ groups: labels: severity: "warning" type: "ceph_default" + - alert: "NVMeoFHostKeepAliveTimeout" + annotations: + description: "Host was disconnected due to host keep alive timeout" + summary: "Host ({{ $labels.host_nqn }}) was disconnected {{ $value }} times from subsystem ({{ $labels.nqn }}) in last 24 hours" + expr: "ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 1184aa084db1..fdb892ff0571 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -3152,3 +3152,55 @@ tests: exp_annotations: summary: "The average write latency over the last 5 mins has reached 20 ms or more on node-1" description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate" + + # NVMeoFHostKeepAliveTimeout + - interval: 1h + input_series: + - series: 'ceph_nvmeof_host_keepalive_timeout{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}' + values: '0 0 0 0 1 0 1 0 1 1 0x14 1 1 0x3' + - series: 'ceph_nvmeof_host_keepalive_timeout{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}' + values: '0 1 1 0 0 0 0 0 0 0 0x19' + promql_expr_test: + - expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0 + eval_time: 2h + exp_samples: + - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}' + value: 1 + - expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0 + eval_time: 8h + exp_samples: + - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}' + value: 3 + - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.2", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}' + value: 1 + - expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0 + eval_time: 29h + exp_samples: + - labels: '{gw_name="client.nvmeof.a", host_nqn="nqn.1", instance="node-1:10008", nqn="nqn.2016-06.io.spdk:cnode1.mygroup"}' + value: 3 + alert_rule_test: + - eval_time: 1h + alertname: NVMeoFHostKeepAliveTimeout + - eval_time: 12h + alertname: NVMeoFHostKeepAliveTimeout + exp_alerts: + - exp_labels: + gw_name: client.nvmeof.a + host_nqn: nqn.1 + instance: node-1:10008 + nqn: nqn.2016-06.io.spdk:cnode1.mygroup + severity: warning + type: ceph_default + exp_annotations: + summary: "Host (nqn.1) was disconnected 3 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours" + description: "Host was disconnected due to host keep alive timeout" + - exp_labels: + gw_name: client.nvmeof.a + host_nqn: nqn.2 + instance: node-1:10008 + nqn: nqn.2016-06.io.spdk:cnode1.mygroup + severity: warning + type: ceph_default + exp_annotations: + summary: "Host (nqn.2) was disconnected 1 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours" + description: "Host was disconnected due to host keep alive timeout"