From: Vallari Agrawal Date: Fri, 5 Jun 2026 05:14:51 +0000 (+0530) Subject: monitoring: fix NVMeoFMultipleNamespacesOfRBDImage X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7114fd10f67265410f0d8a9d96977ce94159959c;p=ceph.git monitoring: fix NVMeoFMultipleNamespacesOfRBDImage Do not trigger alert NVMeoFMultipleNamespacesOfRBDImage for same pool/image name used in multiple nvmeof namespaces, if they are in different rados namespaces (rados_namespace_name). These are valid repetitions of pool/image name: - mypool/rados_ns1/myimage1 - mypool/rados_ns2/myimage1 - mypool/myimage1 (default rados namespace) Fixes: https://tracker.ceph.com/issues/77128 Signed-off-by: Vallari Agrawal --- diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index 9c1c3db4375..6173a9dd781 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -858,10 +858,10 @@ { alert: 'NVMeoFMultipleNamespacesOfRBDImage', 'for': '1m', - expr: 'count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1', + expr: 'count by(pool_name, rbd_name, rados_namespace_name) (count by(bdev_name, pool_name, rbd_name, rados_namespace_name) (ceph_nvmeof_bdev_metadata * on (bdev_name, instance, cluster) group_left(rados_namespace_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1', labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace ', + summary: 'RBD image {{ $labels.pool_name }}/{{ if $labels.rados_namespace_name }}{{ $labels.rados_namespace_name }}/{{ end }}{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespaces', description: 'Each NVMeoF namespace must have a unique RBD pool and image, across all different 
gateway groups.', }, }, diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 8a6c411a2b6..f2aa4defabd 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -804,8 +804,8 @@ groups: - alert: "NVMeoFMultipleNamespacesOfRBDImage" annotations: description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups." - summary: "RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespace " - expr: "count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1" + summary: "RBD image {{ $labels.pool_name }}/{{ if $labels.rados_namespace_name }}{{ $labels.rados_namespace_name }}/{{ end }}{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespaces" + expr: "count by(pool_name, rbd_name, rados_namespace_name) (count by(bdev_name, pool_name, rbd_name, rados_namespace_name) (ceph_nvmeof_bdev_metadata * on (bdev_name, instance, cluster) group_left(rados_namespace_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1" for: "1m" labels: severity: "warning" @@ -995,4 +995,3 @@ groups: oid: "1.3.6.1.4.1.50495.1.2.1.15.2" severity: "warning" type: "ceph_default" - diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 5578544c934..6a4c2cd2076 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -2278,19 +2278,27 @@ tests: # NVMeoFMultipleNamespacesOfRBDImage - interval: 1m input_series: - - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm1", cluster="mycluster", pool_name="mypool", 
rbd_name="myimage1"}' values: '1x10' - - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev1", instance="ceph-nvme-vm2", cluster="mycluster", pool_name="mypool", rbd_name="myimage1"}' values: '1x10' - - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage2"}' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm1", cluster="mycluster", pool_name="mypool", rbd_name="myimage2"}' values: '1x10' - - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage2"}' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev2", instance="ceph-nvme-vm2", cluster="mycluster", pool_name="mypool", rbd_name="myimage2"}' values: '1x10' - - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm1", cluster="mycluster", pool_name="mypool", rbd_name="myimage1"}' values: '1x10' - - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm2", pool_name="mypool", rbd_name="myimage1"}' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev3", instance="ceph-nvme-vm2", cluster="mycluster", pool_name="mypool", rbd_name="myimage1"}' values: '1x10' - - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev4", instance="ceph-nvme-vm1", pool_name="mypool", rbd_name="myimage1"}' # bdev with no ns + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev4", instance="ceph-nvme-vm1", cluster="mycluster", pool_name="mypool", rbd_name="myimage1"}' # bdev with no ns + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev5", instance="ceph-nvme-vm1", cluster="mycluster", pool_name="mypool", rbd_name="myimage5"}' + values: '1x10' + - series: 
'ceph_nvmeof_bdev_metadata{bdev_name="bdev5", instance="ceph-nvme-vm2", cluster="mycluster", pool_name="mypool", rbd_name="myimage5"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev6", instance="ceph-nvme-vm1", cluster="mycluster", pool_name="mypool", rbd_name="myimage5"}' + values: '1x10' + - series: 'ceph_nvmeof_bdev_metadata{bdev_name="bdev6", instance="ceph-nvme-vm2", cluster="mycluster", pool_name="mypool", rbd_name="myimage5"}' values: '1x10' - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn1", nsid="1", bdev_name="bdev1", instance="ceph-nvme-vm1", cluster="mycluster"}' values: '1x10' @@ -2304,8 +2312,16 @@ tests: values: '1x10' - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn2", nsid="1", bdev_name="bdev3", instance="ceph-nvme-vm2", cluster="mycluster"}' values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn3", nsid="1", bdev_name="bdev5", instance="ceph-nvme-vm1", cluster="mycluster", rados_namespace_name="rados1"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn3", nsid="1", bdev_name="bdev5", instance="ceph-nvme-vm2", cluster="mycluster", rados_namespace_name="rados1"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn3", nsid="2", bdev_name="bdev6", instance="ceph-nvme-vm1", cluster="mycluster", rados_namespace_name="rados2"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="nqn3", nsid="2", bdev_name="bdev6", instance="ceph-nvme-vm2", cluster="mycluster", rados_namespace_name="rados2"}' + values: '1x10' promql_expr_test: - - expr: count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1 + - expr: count by(pool_name, rbd_name, rados_namespace_name) (count by(bdev_name, pool_name, rbd_name, rados_namespace_name) (ceph_nvmeof_bdev_metadata * on (bdev_name, instance, cluster) 
group_left(rados_namespace_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1 eval_time: 1m exp_samples: - labels: '{pool_name="mypool", rbd_name="myimage1"}' @@ -2320,7 +2336,7 @@ tests: severity: warning type: ceph_default exp_annotations: - summary: "RBD image mypool/myimage1 cannot be reused for multiple NVMeoF namespace " + summary: "RBD image mypool/myimage1 cannot be reused for multiple NVMeoF namespaces" description: "Each NVMeoF namespace must have a unique RBD pool and image, across all different gateway groups." # NVMeoFTooManyGateways