From 3821548a37373f87109ab0dac7f3ee2d8f3ead99 Mon Sep 17 00:00:00 2001
From: Patrick Seidensal
Date: Wed, 23 Mar 2022 14:53:58 +0100
Subject: [PATCH] mgr/dashboard: Compare values of MTU alert by device

Fixes: https://tracker.ceph.com/issues/55004
Signed-off-by: Patrick Seidensal
---
 monitoring/ceph-mixin/prometheus_alerts.yml | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index fc38678f99dd5..578596f4af0bc 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -704,7 +704,18 @@ groups:
             rate of the past 48 hours.
 
       - alert: CephNodeInconsistentMTU
-        expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
+        expr: |
+          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
+            scalar(
+              max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
+                quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
+            )
+          or
+          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
+            scalar(
+              min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
+                quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
+            )
         labels:
           severity: warning
           type: ceph_default
@@ -712,7 +723,7 @@ groups:
           summary: MTU settings across Ceph hosts are inconsistent
           description: >
             Node {{ $labels.instance }} has a different MTU size ({{ $value }})
-            than the median value on device {{ $labels.device }}.
+            than the median of devices named {{ $labels.device }}.
 
   - name: pools
     rules:
-- 
2.39.5