From: Afreen Misbah Date: Mon, 25 May 2026 09:43:53 +0000 (+0530) Subject: monitoring: add category labels to NVMe-oF alert rules X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f24754723ef47073a92a64628ca1ec9759852c76;p=ceph.git monitoring: add category labels to NVMe-oF alert rules Add a 'category' custom label to all 20 NVMe-oF Prometheus alert rules to enable frontend filtering by resource type. Categories: gateway (6), subsystem (4), listener (4), namespace (2), performance (2), host (2). Fixes: https://tracker.ceph.com/issues/76786 Signed-off-by: Afreen Misbah --- diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index 9c1c3db4375..2e5af729c51 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -849,7 +849,7 @@ alert: 'NVMeoFSubsystemNamespaceLimit', 'for': '1m', expr: '(count by(nqn, cluster, instance) (ceph_nvmeof_subsystem_namespace_metadata)) >= on(nqn, instance) group_right(cluster) ceph_nvmeof_subsystem_namespace_limit', - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'subsystem' }, annotations: { summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces%(cluster)s' % $.MultiClusterSummary(), description: 'Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}', @@ -859,7 +859,7 @@ alert: 'NVMeoFMultipleNamespacesOfRBDImage', 'for': '1m', expr: 'count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1', - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'namespace' }, annotations: { summary: 'RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace ', description: 'Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups.', @@ -869,7 +869,7 @@ alert: 'NVMeoFTooManyGateways', 'for': '1m', expr: 'count(ceph_nvmeof_gateway_info) by (cluster) > %.2f' % [$._config.NVMeoFMaxGatewaysPerCluster], - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'gateway' }, annotations: { summary: 'Max supported gateways exceeded%(cluster)s' % $.MultiClusterSummary(), description: 'You may create many gateways, but %(NVMeoFMaxGatewaysPerCluster)d is the tested limit' % $._config, @@ -879,7 +879,7 @@ alert: 'NVMeoFMaxGatewayGroupSize', 'for': '1m', expr: 'count(ceph_nvmeof_gateway_info) by (cluster,group) > %.2f' % [$._config.NVMeoFMaxGatewaysPerGroup], - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'gateway' }, annotations: { summary: 'Max gateways within a gateway group ({{ $labels.group }}) exceeded%(cluster)s' % $.MultiClusterSummary(), description: 'You may create many gateways in a gateway group, but %(NVMeoFMaxGatewaysPerGroup)d is the tested limit' % $._config, @@ -889,7 +889,7 @@ alert: 'NVMeoFMaxGatewayGroups', 'for': '1m', expr: 'count(count by (group, cluster) (ceph_nvmeof_gateway_info)) by (cluster) > %.2f' % [$._config.NVMeoFMaxGatewayGroups], - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'gateway' }, annotations: { summary: 'Max gateway groups exceeded%(cluster)s' % $.MultiClusterSummary(), description: 'You may create many gateway groups, but %(NVMeoFMaxGatewayGroups)d is the tested limit' % $._config, @@ -899,7 +899,7 @@ alert: 'NVMeoFSingleGateway', 'for': '5m', expr: 'count(ceph_nvmeof_gateway_info) by(cluster,group) == 1', - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'gateway' }, annotations: { summary: 'The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible%(cluster)s' % $.MultiClusterSummary(), description: 'Although a single member gateway group is valid, it should only be used for test purposes', @@ -909,7 +909,7 @@ alert: 'NVMeoFHighGatewayCPU', 'for': '10m', expr: 'label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighGatewayCPU], - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'gateway' }, annotations: { summary: 'CPU used by {{ $labels.instance }} NVMe-oF Gateway is high%(cluster)s' % $.MultiClusterSummary(), description: 'Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores', @@ -919,7 +919,7 @@ alert: 'NVMeoFGatewayOpenSecurity', 'for': '5m', expr: 'ceph_nvmeof_subsystem_metadata{allow_any_host="yes"}', - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'subsystem' }, annotations: { summary: 'Subsystem {{ $labels.nqn }} has been defined without host level security%(cluster)s' % $.MultiClusterSummary(), description: 'It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss', @@ -929,7 +929,7 @@ alert: 'NVMeoFTooManySubsystems', 'for': '1m', expr: 'count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*?)(?::.*)?")) >= %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway], - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'subsystem' }, annotations: { summary: 'The number of subsystems defined to the NVMeoF gateway reached or exceeded the supported values%(cluster)s' % $.MultiClusterSummary(), description: 'NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of %(NVMeoFMaxSubsystemsPerGateway)d subsystems. Current count: {{ $value }}.' % $._config, @@ -939,7 +939,7 @@ alert: 'NVMeoFTooManyNamespaces', 'for': '1m', expr: 'sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*?)(?::.*)?")) >= %.2f' % [$._config.NVMeoFMaxNamespaces], - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'namespace' }, annotations: { summary: 'The number of namespaces defined to the NVMeoF gateway reached or exceeded supported values%(cluster)s' % $.MultiClusterSummary(), description: 'NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of %(NVMeoFMaxNamespaces)d namespaces. Current count: {{ $value }}.' % $._config, @@ -949,7 +949,7 @@ alert: 'NVMeoFVersionMismatch', 'for': '1h', expr: 'count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) > 1', - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'gateway' }, annotations: { summary: 'Too many different NVMe-oF gateway releases active%(cluster)s' % $.MultiClusterSummary(), description: 'This may indicate an issue with deployment. Check cephadm logs', @@ -959,7 +959,7 @@ alert: 'NVMeoFHighClientCount', 'for': '1m', expr: 'ceph_nvmeof_subsystem_host_count > %.2f' % [$._config.NVMeoFHighClientCount], - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'subsystem' }, annotations: { summary: 'The number of clients connected to {{ $labels.nqn }} is too high%(cluster)s' % $.MultiClusterSummary(), description: 'The supported limit for clients connecting to a subsystem is %(NVMeoFHighClientCount)d' % $._config, @@ -969,7 +969,7 @@ alert: 'NVMeoFMissingListener', 'for': '10m', expr: 'ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0', - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'listener' }, annotations: { summary: 'No listener added for {{ $labels.instance }} NVMe-oF Gateway to {{ $labels.nqn }} subsystem', description: 'For every subsystem, each gateway should have a listener to balance traffic between gateways.', @@ -979,7 +979,7 @@ alert: 'NVMeoFZeroListenerSubsystem', 'for': '10m', expr: 'sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0', - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'listener' }, annotations: { summary: 'No listeners added to {{ $labels.nqn }} subsystem', description: 'NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners.', @@ -989,7 +989,7 @@ alert: 'NVMeoFHighHostCPU', 'for': '10m', expr: '100-((100*(avg by(cluster,host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(cluster, host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= %.2f' % [$._config.NVMeoFHighHostCPU], - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'host' }, annotations: { summary: 'The CPU is high ({{ $value }}%%) on NVMeoF Gateway host ({{ $labels.host }})%(cluster)s' % $.MultiClusterSummary(), description: 'High CPU on a gateway host can lead to CPU contention and performance degradation', @@ -999,7 +999,7 @@ alert: 'NVMeoFInterfaceDown', 'for': '30s', expr: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down"}', - labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.14.1' }, + labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.14.1', category: 'listener' }, annotations: { summary: 'Network interface {{ $labels.device }} is down%(cluster)s' % $.MultiClusterSummary(), description: 'A NIC used by one or more subsystems is in a down state', @@ -1009,7 +1009,7 @@ alert: 'NVMeoFInterfaceDuplex', 'for': '30s', expr: 'ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"}', - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'listener' }, annotations: { summary: 'Network interface {{ $labels.device }} is not running in full duplex mode%(cluster)s' % $.MultiClusterSummary(), description: 'Until this is resolved, performance from the gateway will be degraded', @@ -1019,7 +1019,7 @@ alert: 'NVMeoFHighReadLatency', 'for': '5m', expr: 'label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),"gateway","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighClientReadLatency / 1000], - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'performance' }, annotations: { summary: 'The average read latency over the last 5 mins has reached %(NVMeoFHighClientReadLatency)d ms or more on {{ $labels.gateway }}' % $._config, description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate', @@ -1029,7 +1029,7 @@ alert: 'NVMeoFHighWriteLatency', 'for': '5m', expr: 'label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),"gateway","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighClientWriteLatency / 1000], - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'performance' }, annotations: { summary: 'The average write latency over the last 5 mins has reached %(NVMeoFHighClientWriteLatency)d ms or more on {{ $labels.gateway }}' % $._config, description: 'High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate', @@ -1039,7 +1039,7 @@ alert: 'NVMeoFHostKeepAliveTimeout', 'for': '1m', expr: 'ceil(changes(ceph_nvmeof_host_keepalive_timeout[%(NVMeoFHostKeepAliveTimeoutTrackDurationHours)dh:]) / 2) > 0' % $._config, - labels: { severity: 'warning', type: 'ceph_default' }, + labels: { severity: 'warning', type: 'ceph_default', category: 'host' }, annotations: { summary: 'Host ({{ $labels.host_nqn }}) was disconnected {{ $value }} times from subsystem ({{ $labels.nqn }}) in last %(NVMeoFHostKeepAliveTimeoutTrackDurationHours)d hours' % $._config, description: 'Host was disconnected due to host keep alive timeout', diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 8a6c411a2b6..db9120a3de3 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -799,6 +799,7 @@ groups: expr: "(count by(nqn, cluster, instance) (ceph_nvmeof_subsystem_namespace_metadata)) >= on(nqn, instance) group_right(cluster) ceph_nvmeof_subsystem_namespace_limit" for: "1m" labels: + category: "subsystem" severity: "warning" type: "ceph_default" - alert: "NVMeoFMultipleNamespacesOfRBDImage" @@ -808,6 +809,7 @@ groups: expr: "count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1" for: "1m" labels: + category: "namespace" severity: "warning" type: "ceph_default" - alert: "NVMeoFTooManyGateways" @@ -817,6 +819,7 @@ groups: expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 32.00" for: "1m" labels: + category: "gateway" severity: "warning" type: "ceph_default" - alert: "NVMeoFMaxGatewayGroupSize" @@ -826,6 +829,7 @@ groups: expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 8.00" for: "1m" labels: + category: "gateway" severity: "warning" type: "ceph_default" - alert: "NVMeoFMaxGatewayGroups" @@ -835,6 +839,7 @@ groups: expr: "count(count by (group, cluster) (ceph_nvmeof_gateway_info)) by (cluster) > 4.00" for: "1m" labels: + category: "gateway" severity: "warning" type: "ceph_default" - alert: "NVMeoFSingleGateway" @@ -844,6 +849,7 @@ groups: expr: "count(ceph_nvmeof_gateway_info) by(cluster,group) == 1" for: "5m" labels: + category: "gateway" severity: "warning" type: "ceph_default" - alert: "NVMeoFHighGatewayCPU" @@ -853,6 +859,7 @@ groups: expr: "label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode=\"busy\"}[1m])),\"instance\",\"$1\",\"instance\",\"(.*):.*\") > 80.00" for: "10m" labels: + category: "gateway" severity: "warning" type: "ceph_default" - alert: "NVMeoFGatewayOpenSecurity" @@ -862,6 +869,7 @@ groups: expr: "ceph_nvmeof_subsystem_metadata{allow_any_host=\"yes\"}" for: "5m" labels: + category: "subsystem" severity: "warning" type: "ceph_default" - alert: "NVMeoFTooManySubsystems" @@ -871,6 +879,7 @@ groups: expr: "count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) >= 128.00" for: "1m" labels: + category: "subsystem" severity: "warning" type: "ceph_default" - alert: "NVMeoFTooManyNamespaces" @@ -880,6 +889,7 @@ groups: expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) >= 4096.00" for: "1m" labels: + category: "namespace" severity: "warning" type: "ceph_default" - alert: "NVMeoFVersionMismatch" @@ -889,6 +899,7 @@ groups: expr: "count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) > 1" for: "1h" labels: + category: "gateway" severity: "warning" type: "ceph_default" - alert: "NVMeoFHighClientCount" @@ -898,6 +909,7 @@ groups: expr: "ceph_nvmeof_subsystem_host_count > 128.00" for: "1m" labels: + category: "subsystem" severity: "warning" type: "ceph_default" - alert: "NVMeoFMissingListener" @@ -907,6 +919,7 @@ groups: expr: "ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0" for: "10m" labels: + category: "listener" severity: "warning" type: "ceph_default" - alert: "NVMeoFZeroListenerSubsystem" @@ -916,6 +929,7 @@ groups: expr: "sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0" for: "10m" labels: + category: "listener" severity: "warning" type: "ceph_default" - alert: "NVMeoFHighHostCPU" @@ -925,6 +939,7 @@ groups: expr: "100-((100*(avg by(cluster,host) (label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]),\"host\",\"$1\",\"instance\",\"(.*):.*\")) * on(cluster, host) group_right label_replace(ceph_nvmeof_gateway_info,\"host\",\"$1\",\"instance\",\"(.*):.*\")))) >= 80.00" for: "10m" labels: + category: "host" severity: "warning" type: "ceph_default" - alert: "NVMeoFInterfaceDown" @@ -934,6 +949,7 @@ groups: expr: "ceph_nvmeof_subsystem_listener_iface_info{operstate=\"down\"}" for: "30s" labels: + category: "listener" oid: "1.3.6.1.4.1.50495.1.2.1.14.1" severity: "warning" type: "ceph_default" @@ -944,6 +960,7 @@ groups: expr: "ceph_nvmeof_subsystem_listener_iface_info{duplex!=\"full\"}" for: "30s" labels: + category: "listener" severity: "warning" type: "ceph_default" - alert: "NVMeoFHighReadLatency" @@ -953,6 +970,7 @@ groups: expr: "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.01" for: "5m" labels: + category: "performance" severity: "warning" type: "ceph_default" - alert: "NVMeoFHighWriteLatency" @@ -962,6 +980,7 @@ groups: expr: "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.02" for: "5m" labels: + category: "performance" severity: "warning" type: "ceph_default" - alert: "NVMeoFHostKeepAliveTimeout" @@ -971,6 +990,7 @@ groups: expr: "ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0" for: "1m" labels: + category: "host" severity: "warning" type: "ceph_default" - name: "certmgr" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 5578544c934..e768acf5760 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -2271,6 +2271,7 @@ tests: severity: warning cluster: mycluster type: ceph_default + category: subsystem exp_annotations: summary: "wah subsystem has reached its maximum number of namespaces on cluster mycluster" description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to wah" @@ -2316,9 +2317,10 @@ tests: exp_alerts: - exp_labels: pool_name: mypool - rbd_name: myimage1 + rbd_name: myimage1 severity: warning type: ceph_default + category: namespace exp_annotations: summary: "RBD image mypool/myimage1 cannot be reused for multiple NVMeoF namespace " description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups." @@ -2407,6 +2409,7 @@ tests: severity: warning cluster: mycluster type: ceph_default + category: gateway exp_annotations: summary: "Max supported gateways exceeded on cluster mycluster" description: "You may create many gateways, but 32 is the tested limit" @@ -2451,6 +2454,7 @@ tests: severity: warning cluster: mycluster type: ceph_default + category: gateway exp_annotations: summary: "Max gateways within a gateway group (group-1) exceeded on cluster mycluster" description: "You may create many gateways in a gateway group, but 8 is the tested limit" @@ -2483,6 +2487,7 @@ tests: severity: warning cluster: mycluster type: ceph_default + category: gateway exp_annotations: summary: "Max gateway groups exceeded on cluster mycluster" description: "You may create many gateway groups, but 4 is the tested limit" @@ -2511,6 +2516,7 @@ tests: severity: warning cluster: mycluster type: ceph_default + category: gateway exp_annotations: summary: "The gateway group group-1 consists of a single gateway - HA is not possible on cluster mycluster" description: "Although a single member gateway group is valid, it should only be used for test purposes" @@ -2535,6 +2541,7 @@ tests: severity: warning cluster: mycluster type: ceph_default + category: gateway exp_annotations: summary: "CPU used by node-1 NVMe-oF Gateway is high on cluster mycluster" description: "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores" @@ -2562,6 +2569,7 @@ tests: severity: warning cluster: mycluster type: ceph_default + category: subsystem exp_annotations: summary: "Subsystem nqn.bad has been defined without host level security on cluster mycluster" description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss" @@ -2842,6 +2850,7 @@ tests: severity: warning cluster: mycluster type: ceph_default + category: subsystem exp_annotations: summary: "The number of subsystems defined to the NVMeoF gateway reached or exceeded the supported values on cluster mycluster" description: "NVMeoF gateway node-1 has reached or exceeded the supported maximum of 128 subsystems. Current count: 129." @@ -2886,6 +2895,7 @@ tests: severity: warning cluster: mycluster type: ceph_default + category: namespace exp_annotations: summary: "The number of namespaces defined to the NVMeoF gateway reached or exceeded supported values on cluster mycluster" description: "NVMeoF gateway node-1 has reached or exceeded the supported maximum of 4096 namespaces. Current count: 4400." @@ -2911,6 +2921,7 @@ tests: severity: warning cluster: mycluster type: ceph_default + category: gateway exp_annotations: summary: "Too many different NVMe-oF gateway releases active on cluster mycluster" description: "This may indicate an issue with deployment. Check cephadm logs" @@ -2937,6 +2948,7 @@ tests: severity: warning cluster: mycluster type: ceph_default + category: subsystem exp_annotations: summary: "The number of clients connected to nqn1 is too high on cluster mycluster" description: "The supported limit for clients connecting to a subsystem is 128" @@ -2973,6 +2985,7 @@ tests: nqn: nqn1 severity: warning type: ceph_default + category: listener exp_annotations: summary: "No listener added for node-1:9100 NVMe-oF Gateway to nqn1 subsystem" description: "For every subsystem, each gateway should have a listener to balance traffic between gateways." @@ -3006,6 +3019,7 @@ tests: nqn: nqn1 severity: warning type: ceph_default + category: listener exp_annotations: summary: "No listeners added to nqn1 subsystem" description: "NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners." @@ -3039,6 +3053,7 @@ tests: cluster: mycluster severity: warning type: ceph_default + category: host exp_annotations: summary: "The CPU is high (85%) on NVMeoF Gateway host (node-1) on cluster mycluster" description: "High CPU on a gateway host can lead to CPU contention and performance degradation" @@ -3067,6 +3082,7 @@ tests: cluster: mycluster severity: warning type: ceph_default + category: listener exp_annotations: summary: "Network interface eth0 is down on cluster mycluster" description: "A NIC used by one or more subsystems is in a down state" @@ -3094,6 +3110,7 @@ tests: cluster: mycluster severity: warning type: ceph_default + category: listener exp_annotations: summary: "Network interface eth1 is not running in full duplex mode on cluster mycluster" description: "Until this is resolved, performance from the gateway will be degraded" @@ -3124,6 +3141,7 @@ tests: instance: node-1:10008 severity: warning type: ceph_default + category: performance exp_annotations: summary: "The average read latency over the last 5 mins has reached 10 ms or more on node-1" description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate" @@ -3154,6 +3172,7 @@ tests: instance: node-1:10008 severity: warning type: ceph_default + category: performance exp_annotations: summary: "The average write latency over the last 5 mins has reached 20 ms or more on node-1" description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate" @@ -3196,6 +3215,7 @@ tests: nqn: nqn.2016-06.io.spdk:cnode1.mygroup severity: warning type: ceph_default + category: host exp_annotations: summary: "Host (nqn.1) was disconnected 3 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours" description: "Host was disconnected due to host keep alive timeout" @@ -3206,6 +3226,7 @@ tests: nqn: nqn.2016-06.io.spdk:cnode1.mygroup severity: warning type: ceph_default + category: host exp_annotations: summary: "Host (nqn.2) was disconnected 1 times from subsystem (nqn.2016-06.io.spdk:cnode1.mygroup) in last 24 hours" description: "Host was disconnected due to host keep alive timeout"