From: Aashish Sharma Date: Thu, 11 Dec 2025 08:38:11 +0000 (+0530) Subject: monitoring: fix CephPGImbalance alert rule expression X-Git-Tag: v19.2.4~51^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F66829%2Fhead;p=ceph.git monitoring: fix CephPGImbalance alert rule expression The alert CephPGImbalance doesn't take the configured device classes into account. As a result, there can be false positives when using mixed-size OSD disks. Ref: https://github.com/rook/rook/discussions/13126#discussioncomment-10043490 Fixes: https://tracker.ceph.com/issues/69690 Signed-off-by: Aashish Sharma (cherry picked from commit 5b4f7373655fa829af359d6e3cc61416964a97f0) Conflicts: monitoring/ceph-mixin/prometheus_alerts.yml (remove cluster label from alert since it's not there in squid) monitoring/ceph-mixin/tests_alerts/test_alerts.yml (remove cluster label from the alert since it's not there in squid) --- diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index ba06f01f1f91..afc09c562c12 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -237,13 +237,25 @@ groups: type: "ceph_default" - alert: "CephPGImbalance" annotations: - description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count." + description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count in the device class {{ $labels.device_class }}." 
summary: "PGs are not balanced across OSDs" expr: | abs( - ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) / - on (job) group_left avg(ceph_osd_numpg > 0) by (job) - ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 + ( + ( + (ceph_osd_numpg > 0) + * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata + ) + - on (job, device_class) group_left avg( + (ceph_osd_numpg > 0) + * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata + ) by (job, device_class) + ) + / on (job, device_class) group_left avg( + (ceph_osd_numpg > 0) + * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata + ) by (job, device_class) + ) > 0.30 for: "5m" labels: oid: "1.3.6.1.4.1.50495.1.2.1.4.5" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 0efc3c9ad249..07be8a282854 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -176,48 +176,36 @@ tests: - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283", job="ceph"}' values: '100 100 100 100 100 160' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + - series: 'ceph_osd_metadata{ceph_daemon="osd.0",device_class="hdd",hostname="ceph",instance="ceph:9283",job="ceph"}' values: '1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - 
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + - series: 'ceph_osd_metadata{ceph_daemon="osd.1",device_class="hdd",hostname="ceph",instance="ceph:9283",job="ceph"}' values: '1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + - series: 'ceph_osd_metadata{ceph_daemon="osd.2",device_class="hdd",hostname="ceph",instance="ceph:9283",job="ceph"}' values: '1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + - series: 'ceph_osd_metadata{ceph_daemon="osd.3",device_class="hdd",hostname="ceph",instance="ceph:9283",job="ceph"}' values: '1 1 1 1 1 1' promql_expr_test: - expr: | abs( ( - (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) - by (job) - ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job) - ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 - + ( + (ceph_osd_numpg > 0) + * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata + ) + - on (job, device_class) group_left avg( + (ceph_osd_numpg > 0) + * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata + ) by (job, device_class) + ) + / on (job, device_class) group_left avg( + (ceph_osd_numpg > 0) + * on (job, ceph_daemon) group_left(hostname, device_class) ceph_osd_metadata + ) by (job, device_class) + ) > 0.30 eval_time: 5m exp_samples: - - labels: 
'{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283", - job="ceph"}' - value: 6E-01 + - labels: '{ceph_daemon="osd.1", hostname="ceph", device_class="hdd", instance="ceph:9283", job="ceph"}' + value: 0.60 alert_rule_test: - eval_time: 10m alertname: CephPGImbalance @@ -225,6 +213,7 @@ tests: - exp_labels: ceph_daemon: osd.1 hostname: ceph + device_class: hdd instance: ceph:9283 job: ceph oid: 1.3.6.1.4.1.50495.1.2.1.4.5 @@ -232,7 +221,7 @@ tests: type: ceph_default exp_annotations: summary: PGs are not balanced across OSDs - description: "OSD osd.1 on ceph deviates by more than 30% from average PG count." + description: "OSD osd.1 on ceph deviates by more than 30% from average PG count in the device class hdd." # pgs inactive - interval: 1m