From 61b3289e41d5cf28f130a75beaac56e5591d899d Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Tue, 19 Nov 2024 17:25:49 +0530 Subject: [PATCH] monitoring: Add prometheus alert NVMeoFMultipleNamespacesOfRBDImage NVMeoFMultipleNamespacesOfRBDImage alerts the user if an RBD image is used for multiple namespaces. This is an important alert for cases where namespaces are created on the same image for different gateway groups. Signed-off-by: Vallari Agrawal --- .../ceph-mixin/prometheus_alerts.libsonnet | 10 ++++ monitoring/ceph-mixin/prometheus_alerts.yml | 9 ++++ .../ceph-mixin/tests_alerts/test_alerts.yml | 48 +++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index 143e65f20e7dc..5d1ab49b53340 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -855,6 +855,16 @@ description: 'Subsystems have a max namespace limit defined at creation time. 
This alert means that no more namespaces can be added to {{ $labels.nqn }}', }, }, + { + alert: 'NVMeoFMultipleNamespacesOfRBDImage', + 'for': '1m', + expr: 'count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace ', + description: 'Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups.', + }, + }, { alert: 'NVMeoFTooManyGateways', 'for': '1m', diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 3eb8a8db4faf9..3440d761351f1 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -765,6 +765,15 @@ groups: labels: severity: "warning" type: "ceph_default" + - alert: "NVMeoFMultipleNamespacesOfRBDImage" + annotations: + description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups." 
+ summary: "RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace " + expr: "count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" - alert: "NVMeoFTooManyGateways" annotations: description: "You may create many gateways, but 4 is the tested limit" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index a4e63bbcf7338..b3b29308d08b7 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -2270,6 +2270,54 @@ tests: summary: "wah subsystem has reached its maximum number of namespaces on cluster mycluster" description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to wah" +# NVMeoFMultipleNamespacesOfRBDImage + - interval: 1m + input_series: + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage2"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage2"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev4", 
instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' # bdev with no ns + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm1", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm2", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm1", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="2", bdev_name="bdev2", instance="ceph-nvme-vm2", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm1", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm2", cluster="mycluster"}' + values: '1x10' + promql_expr_test: + - expr: count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1 + eval_time: 1m + exp_samples: + - labels: '{pool_name="mypool", rbd_name="myimage1"}' + value: 2 + alert_rule_test: + - eval_time: 5m + alertname: NVMeoFMultipleNamespacesOfRBDImage + exp_alerts: + - exp_labels: + pool_name: mypool + rbd_name: myimage1 + severity: warning + type: ceph_default + exp_annotations: + summary: "RBD image mypool/myimage1 cannot be reused for multiple NVMeoF namespace " + description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups." + # NVMeoFTooManyGateways - interval: 1m input_series: -- 2.39.5