git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
monitoring: fix MTU Mismatch alert rule and expr [fix-mtu-alert] [64467/head]
author Aashish Sharma <Aashish.Sharma1@ibm.com>
Wed, 2 Jul 2025 11:05:14 +0000 (16:35 +0530)
committer Aashish Sharma <Aashish.Sharma1@ibm.com>
Wed, 2 Jul 2025 11:05:14 +0000 (16:35 +0530)
Signed-off-by: Aashish Sharma <aasharma@redhat.com>
monitoring/ceph-mixin/prometheus_alerts.yml
monitoring/ceph-mixin/tests_alerts/test_alerts.yml

index 22d121e70764b9cf04a52ea8506b154a20ff7693..4dd419e1df4d58de5beef9e77ba353f605728fd7 100644 (file)
@@ -515,7 +515,7 @@ groups:
         annotations:
           description: "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}."
           summary: "MTU settings across Ceph hosts are inconsistent on cluster {{ $labels.cluster }}"
-        expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) ==  scalar(    max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) !=      quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0))  )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) ==  scalar(    min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) !=      quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0))  )"
+        expr: "(node_network_mtu_bytes{device!=\"lo\"} * (node_network_up{device!=\"lo\"} > 0))!=on(cluster, device)group_left()quantile by (cluster, device) (0.5,node_network_mtu_bytes{device!=\"lo\"} * (node_network_up{device!=\"lo\"} > 0))"
         labels:
           severity: "warning"
           type: "ceph_default"
index 1184aa084db1e3a87e7f7585124cb665ffdfe54c..833d481d72cf8289e6d74b180cf53fa446c114e8 100644 (file)
@@ -146,7 +146,7 @@ tests:
        exp_samples:
          - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
             job="ceph",cluster="mycluster"}'
-           value: 1.2200000000000001E+01
+           value: 1.21E+01
    alert_rule_test:
      - eval_time: 5m
        alertname: CephOSDFlapping
@@ -508,77 +508,116 @@ tests:
  # MTU Mismatch
  - interval: 1m
    input_series:
-    - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    # Test 1: All MTUs match (NO alert)
+    - series: 'node_network_mtu_bytes{device="eth0",instance="host1",cluster="c"}'
       values: '1500 1500 1500 1500 1500'
-    - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_mtu_bytes{device="eth0",instance="host2",cluster="c"}'
       values: '1500 1500 1500 1500 1500'
-    - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
-      values: '1500 1500 1500 1500 1500'
-    - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
-      values: '1500 1500 1500 1500 1500'
-    - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_up{device="eth0",instance="host1",cluster="c"}'
+      values: '1 1 1 1 1'
+    - series: 'node_network_up{device="eth0",instance="host2",cluster="c"}'
+      values: '1 1 1 1 1'
+
+    # Test 2: One host has different MTU
+    - series: 'node_network_mtu_bytes{device="eth1",instance="host1",cluster="c"}'
       values: '9000 9000 9000 9000 9000'
-    - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}'
-      values: '2200 2200 2200 2200 2200'
-    - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}'
-      values: '2400 2400 2400 2400 2400'
-    - series: 'node_network_up{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
-      values: '0 0 0 0 0'
-    - series: 'node_network_up{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
-      values: '0 0 0 0 0'
-    - series: 'node_network_up{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_mtu_bytes{device="eth1",instance="host2",cluster="c"}'
+      values: '9000 9000 9000 9000 9000'
+    - series: 'node_network_mtu_bytes{device="eth1",instance="host3",cluster="c"}'
+      values: '9200 9200 9200 9200 9200'
+    - series: 'node_network_up{device="eth1",instance="host1",cluster="c"}'
       values: '1 1 1 1 1'
-    - series: 'node_network_up{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_up{device="eth1",instance="host2",cluster="c"}'
       values: '1 1 1 1 1'
-    - series: 'node_network_up{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_up{device="eth1",instance="host3",cluster="c"}'
+      values: '1 1 1 1 1'
+
+    # Test 3: Down host should not alert
+    - series: 'node_network_mtu_bytes{device="eth2",instance="host1",cluster="c"}'
+      values: '2400 2400 2400 2400 2400'
+    - series: 'node_network_mtu_bytes{device="eth2",instance="host2",cluster="c"}'
+      values: '2400 2400 2400 2400 2400'
+    - series: 'node_network_mtu_bytes{device="eth2",instance="host3",cluster="c"}'
+      values: '9000 9000 9000 9000 9000'
+    - series: 'node_network_up{device="eth2",instance="host1",cluster="c"}'
       values: '1 1 1 1 1'
-    - series: 'node_network_up{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_up{device="eth2",instance="host2",cluster="c"}'
       values: '1 1 1 1 1'
-    - series: 'node_network_up{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_up{device="eth2",instance="host3",cluster="c"}'
       values: '0 0 0 0 0'
+
+    # Test 4: All different MTUs, odd count, median = 2400
+    - series: 'node_network_mtu_bytes{device="eth3",instance="host1",cluster="c"}'
+      values: '1500 1500 1500 1500 1500'
+    - series: 'node_network_mtu_bytes{device="eth3",instance="host2",cluster="c"}'
+      values: '2400 2400 2400 2400 2400'
+    - series: 'node_network_mtu_bytes{device="eth3",instance="host3",cluster="c"}'
+      values: '9000 9000 9000 9000 9000'
+    - series: 'node_network_up{device="eth3",instance="host1",cluster="c"}'
+      values: '1 1 1 1 1'
+    - series: 'node_network_up{device="eth3",instance="host2",cluster="c"}'
+      values: '1 1 1 1 1'
+    - series: 'node_network_up{device="eth3",instance="host3",cluster="c"}'
+      values: '1 1 1 1 1'
+
    promql_expr_test:
-     - expr: |
-          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
-            scalar(
-              max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
-                quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
-            )
-          or
-          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
-            scalar(
-              min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
-                quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
-            )
-       eval_time: 1m
-       exp_samples:
-         - labels: '{device="eth4", instance="node-exporter", job="node-exporter", cluster="mycluster"}'
-           value: 9000
-         - labels: '{device="eth4", instance="hostname1", job="node-exporter", cluster="mycluster"}'
-           value: 2200
+    - expr: |
+        (
+          node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0)
+        )
+        !=
+        on(cluster, device)
+        group_left()
+        quantile by (cluster, device) (
+          0.5,
+          node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0)
+        )
+      eval_time: 1m
+      exp_samples:
+        # eth1: host3 deviates from 9000 (median)
+        - labels: '{device="eth1", instance="host3", cluster="c"}'
+          value: 9200
+
+        # eth3: median is 2400, host1 and host3 differ
+        - labels: '{device="eth3", instance="host1", cluster="c"}'
+          value: 1500
+        - labels: '{device="eth3", instance="host3", cluster="c"}'
+          value: 9000
+
    alert_rule_test:
-     - eval_time: 1m
-       alertname: CephNodeInconsistentMTU
-       exp_alerts:
-       - exp_labels:
-           device: eth4
-           instance: hostname1
-           job: node-exporter
-           severity: warning
-           type: ceph_default
-           cluster: "mycluster"
-         exp_annotations:
-           summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster
-           description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
-       - exp_labels:
-           device: eth4
-           instance: node-exporter
-           job: node-exporter
-           severity: warning
-           type: ceph_default
-           cluster: "mycluster"
-         exp_annotations:
-           summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster
-           description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4."
+    - eval_time: 1m
+      alertname: CephNodeInconsistentMTU
+      exp_alerts:
+        # Test 2 - host3 has wrong MTU
+        - exp_labels:
+            device: eth1
+            instance: host3
+            cluster: c
+            severity: warning
+            type: ceph_default
+          exp_annotations:
+            summary: MTU settings across Ceph hosts are inconsistent on cluster c
+            description: "Node host3 has a different MTU size (9200) than the median of devices named eth1."
+
+        # Test 4 - host1 and host3 deviate from median 2400
+        - exp_labels:
+            device: eth3
+            instance: host1
+            cluster: c
+            severity: warning
+            type: ceph_default
+          exp_annotations:
+            summary: MTU settings across Ceph hosts are inconsistent on cluster c
+            description: "Node host1 has a different MTU size (1500) than the median of devices named eth3."
+        - exp_labels:
+            device: eth3
+            instance: host3
+            cluster: c
+            severity: warning
+            type: ceph_default
+          exp_annotations:
+            summary: MTU settings across Ceph hosts are inconsistent on cluster c
+            description: "Node host3 has a different MTU size (9000) than the median of devices named eth3."
 
  # pool full, data series has 6 but using topk(5) so to ensure the
  # results are working as expected