git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
monitoring: fix CephPGImbalance alert rule expression 66829/head
author Aashish Sharma <Aashish.Sharma1@ibm.com>
Thu, 11 Dec 2025 08:38:11 +0000 (14:08 +0530)
committer Aashish Sharma <aashish@li-e9bf2ecc-2ad7-11b2-a85c-baf05c5182ab.ibm.com>
Thu, 8 Jan 2026 04:55:13 +0000 (10:25 +0530)
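The CephPGImbalance alert doesn't take the configured device classes into account: it compares each OSD's PG count against the average across all OSDs rather than the average within its own device class. As a result, there can be false positives when using mixed-size OSD disks.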
Ref: https://github.com/rook/rook/discussions/13126#discussioncomment-10043490

Fixes: https://tracker.ceph.com/issues/69690
Signed-off-by: Aashish Sharma <aasharma@redhat.com>
(cherry picked from commit 5b4f7373655fa829af359d6e3cc61416964a97f0)

Conflicts:
monitoring/ceph-mixin/prometheus_alerts.yml (remove the cluster
label from the alert since it's not there in squid)
monitoring/ceph-mixin/tests_alerts/test_alerts.yml (remove
the cluster label from the alert since it's not there in squid)

monitoring/ceph-mixin/prometheus_alerts.yml
monitoring/ceph-mixin/tests_alerts/test_alerts.yml

index ba06f01f1f91ccf6a995efa7fbc869c4e7d7067c..afc09c562c1252d4e65ede5f560ed72333502e74 100644 (file)
@@ -237,13 +237,25 @@ groups:
           type: "ceph_default"
       - alert: "CephPGImbalance"
         annotations:
-          description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count."
+          description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count in the device class {{ $labels.device_class }}."
           summary: "PGs are not balanced across OSDs"
         expr: |
           abs(
-            ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) /
-            on (job) group_left avg(ceph_osd_numpg > 0) by (job)
-          ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
+            (
+              (
+                (ceph_osd_numpg > 0)
+                * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata
+              )
+              - on (job, device_class) group_left avg(
+                  (ceph_osd_numpg > 0)
+                  * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata
+                ) by (job, device_class)
+            )
+            / on (job, device_class) group_left avg(
+                (ceph_osd_numpg > 0)
+                * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata
+              ) by (job, device_class)
+          ) > 0.30
         for: "5m"
         labels:
           oid: "1.3.6.1.4.1.50495.1.2.1.4.5"
index 0efc3c9ad24957d83f46f5356180f004665472a4..07be8a282854e36b9c68a23e155547995d44f9da 100644 (file)
@@ -176,48 +176,36 @@ tests:
     - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",
       job="ceph"}'
       values: '100 100 100 100 100 160'
-    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
-      ceph_version="ceph version 17.0.0-189-g3558fd72
-      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-      public_addr="172.20.0.2"}'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.0",device_class="hdd",hostname="ceph",instance="ceph:9283",job="ceph"}'
       values: '1 1 1 1 1 1'
-    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
-      ceph_version="ceph version 17.0.0-189-g3558fd72
-      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-      public_addr="172.20.0.2"}'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.1",device_class="hdd",hostname="ceph",instance="ceph:9283",job="ceph"}'
       values: '1 1 1 1 1 1'
-    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
-      ceph_version="ceph version 17.0.0-189-g3558fd72
-      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-      public_addr="172.20.0.2"}'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.2",device_class="hdd",hostname="ceph",instance="ceph:9283",job="ceph"}'
       values: '1 1 1 1 1 1'
-    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",
-      ceph_version="ceph version 17.0.0-189-g3558fd72
-      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-      public_addr="172.20.0.2"}'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.3",device_class="hdd",hostname="ceph",instance="ceph:9283",job="ceph"}'
       values: '1 1 1 1 1 1'
    promql_expr_test:
      - expr: |
          abs(
            (
-             (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
-             by (job)
-           ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
-         ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
-
+             (
+               (ceph_osd_numpg > 0)
+               * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata
+             )
+             - on (job, device_class) group_left avg(
+                (ceph_osd_numpg > 0)
+                * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata
+              ) by (job, device_class)
+           )
+           / on (job, device_class) group_left avg(
+              (ceph_osd_numpg > 0)
+              * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata
+           ) by (job, device_class)
+          ) > 0.30
        eval_time: 5m
        exp_samples:
-         - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
-           job="ceph"}'
-           value: 6E-01
+        - labels: '{ceph_daemon="osd.1", hostname="ceph", device_class="hdd", instance="ceph:9283", job="ceph"}'
+          value: 0.60
    alert_rule_test:
      - eval_time: 10m
        alertname: CephPGImbalance
@@ -225,6 +213,7 @@ tests:
        - exp_labels:
            ceph_daemon: osd.1
            hostname: ceph
+           device_class: hdd
            instance: ceph:9283
            job: ceph
            oid: 1.3.6.1.4.1.50495.1.2.1.4.5
@@ -232,7 +221,7 @@ tests:
            type: ceph_default
          exp_annotations:
            summary: PGs are not balanced across OSDs
-           description: "OSD osd.1 on ceph deviates by more than 30% from average PG count."
+           description: "OSD osd.1 on ceph deviates by more than 30% from average PG count in the device class hdd."
 
  # pgs inactive
  - interval: 1m