]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
monitoring: fix MTU Mismatch alert rule and expr
author Aashish Sharma <Aashish.Sharma1@ibm.com>
Wed, 2 Jul 2025 11:05:14 +0000 (16:35 +0530)
committer Aashish Sharma <Aashish.Sharma1@ibm.com>
Mon, 29 Sep 2025 08:05:49 +0000 (13:35 +0530)
Fixes: https://tracker.ceph.com/issues/73290
Signed-off-by: Aashish Sharma <aasharma@redhat.com>
(cherry picked from commit bee24dec441b9e6b263e4498c2ab333b0a60a52d)

Conflicts:
src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html
src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts

monitoring/ceph-mixin/prometheus_alerts.yml
monitoring/ceph-mixin/tests_alerts/test_alerts.yml
src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html
src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts
src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html
src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.scss
src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.ts

index 5f840d93a98131b1894aed3663f0fd4e48aae950..e58072b9bcfbf38e5b4020ccb698f6bb291a484f 100644 (file)
@@ -511,14 +511,38 @@ groups:
           oid: "1.3.6.1.4.1.50495.1.2.1.8.4"
           severity: "warning"
           type: "ceph_default"
-      - alert: "CephNodeInconsistentMTU"
-        annotations:
-          description: "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}."
-          summary: "MTU settings across Ceph hosts are inconsistent on cluster {{ $labels.cluster }}"
-        expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) ==  scalar(    max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) !=      quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0))  )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) ==  scalar(    min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) !=      quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0))  )"
+      - alert: CephNodeInconsistentMTU
+        expr: |
+          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)
+          != on (cluster, device) group_left
+            quantile by (cluster, device) (
+              0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)
+            )
         labels:
-          severity: "warning"
-          type: "ceph_default"
+          severity: warning
+          type: ceph_default
+        annotations:
+          summary: "Node {{ $labels.instance }} has inconsistent MTU settings in cluster {{ $labels.cluster }}"
+          description: "Network interface {{ $labels.device }} on node {{ $labels.instance }} has MTU {{ $value }} which differs from the cluster median."
+          impact: |
+            - May cause packet fragmentation or packet drops
+            - Risk of degraded cluster communication and performance
+            - Potential instability in services relying on consistent networking (e.g., Ceph, Kubernetes)
+          fix: |
+            - Check the MTU of interface `{{ $labels.device }}` on node `{{ $labels.instance }}`:
+              ip link show {{ $labels.device }}
+
+            - Find the median MTU value across the cluster by running this PromQL query in Prometheus:
+              quantile by (cluster, device) (0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
+
+            - Standardize MTU across all nodes to match the median (commonly 1500 or 9000):
+              ip link set dev {{ $labels.device }} mtu <median-value>
+
+            - Make MTU setting persistent:
+              - RHEL/CentOS: edit `/etc/sysconfig/network-scripts/ifcfg-<device>`
+              - Debian/Ubuntu: edit `/etc/netplan/*.yaml` and apply with `netplan apply`
+
+            - Restart the affected interface or node if required.
   - name: "pools"
     rules:
       - alert: "CephPoolGrowthWarning"
index fdb892ff05718e00cb78441d833820909a43bb93..0beb1da6fddbf3a5864c076d1ae6104cef93827a 100644 (file)
@@ -505,80 +505,83 @@ tests:
            summary: Host filesystem free space is getting low on cluster mycluster
            description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate."
 
- # MTU Mismatch
+# MTU Mismatch
  - interval: 1m
    input_series:
-    - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_mtu_bytes{device="eth0",instance="node1",job="node-exporter",cluster="mycluster"}'
       values: '1500 1500 1500 1500 1500'
-    - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_mtu_bytes{device="eth1",instance="node1",job="node-exporter",cluster="mycluster"}'
       values: '1500 1500 1500 1500 1500'
-    - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_mtu_bytes{device="eth2",instance="node1",job="node-exporter",cluster="mycluster"}'
       values: '1500 1500 1500 1500 1500'
-    - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_mtu_bytes{device="eth3",instance="node1",job="node-exporter",cluster="mycluster"}'
       values: '1500 1500 1500 1500 1500'
-    - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+
+    - series: 'node_network_mtu_bytes{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}'
+      values: '1500 1500 1500 1500 1500'
+    - series: 'node_network_mtu_bytes{device="eth4",instance="node2",job="node-exporter",cluster="mycluster"}'
       values: '9000 9000 9000 9000 9000'
-    - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}'
-      values: '2200 2200 2200 2200 2200'
-    - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}'
-      values: '2400 2400 2400 2400 2400'
-    - series: 'node_network_up{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
-      values: '0 0 0 0 0'
-    - series: 'node_network_up{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
-      values: '0 0 0 0 0'
-    - series: 'node_network_up{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_mtu_bytes{device="eth4",instance="node3",job="node-exporter",cluster="mycluster"}'
+      values: '9000 9000 9000 9000 9000'
+
+    - series: 'node_network_up{device="eth0",instance="node1",job="node-exporter",cluster="mycluster"}'
       values: '1 1 1 1 1'
-    - series: 'node_network_up{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_up{device="eth1",instance="node1",job="node-exporter",cluster="mycluster"}'
       values: '1 1 1 1 1'
-    - series: 'node_network_up{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_up{device="eth2",instance="node1",job="node-exporter",cluster="mycluster"}'
       values: '1 1 1 1 1'
-    - series: 'node_network_up{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}'
+    - series: 'node_network_up{device="eth3",instance="node1",job="node-exporter",cluster="mycluster"}'
+      values: '1 1 1 1 1'
+    - series: 'node_network_up{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}'
+      values: '1 1 1 1 1'
+    - series: 'node_network_up{device="eth4",instance="node2",job="node-exporter",cluster="mycluster"}'
+      values: '1 1 1 1 1'
+    - series: 'node_network_up{device="eth4",instance="node3",job="node-exporter",cluster="mycluster"}'
       values: '1 1 1 1 1'
-    - series: 'node_network_up{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}'
-      values: '0 0 0 0 0'
    promql_expr_test:
      - expr: |
-          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
-            scalar(
-              max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
-                quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
-            )
-          or
-          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
-            scalar(
-              min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
-                quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
-            )
+         node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)
+         != on (cluster, device) group_left
+          quantile by (cluster, device) (
+            0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)
+          )
        eval_time: 1m
        exp_samples:
-         - labels: '{device="eth4", instance="node-exporter", job="node-exporter", cluster="mycluster"}'
-           value: 9000
-         - labels: '{device="eth4", instance="hostname1", job="node-exporter", cluster="mycluster"}'
-           value: 2200
+         - labels: '{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}'
+           value: 1500
    alert_rule_test:
      - eval_time: 1m
        alertname: CephNodeInconsistentMTU
        exp_alerts:
        - exp_labels:
            device: eth4
-           instance: hostname1
+           instance: node1
            job: node-exporter
            severity: warning
            type: ceph_default
            cluster: "mycluster"
          exp_annotations:
-           summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster
-           description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
-       - exp_labels:
-           device: eth4
-           instance: node-exporter
-           job: node-exporter
-           severity: warning
-           type: ceph_default
-           cluster: "mycluster"
-         exp_annotations:
-           summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster
-           description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4."
+           summary: "Node node1 has inconsistent MTU settings in cluster mycluster"
+           description: "Network interface eth4 on node node1 has MTU 1500 which differs from the cluster median."
+           impact: |
+             - May cause packet fragmentation or packet drops
+             - Risk of degraded cluster communication and performance
+             - Potential instability in services relying on consistent networking (e.g., Ceph, Kubernetes)
+           fix: |
+             - Check the MTU of interface `eth4` on node `node1`:
+               ip link show eth4
+
+             - Find the median MTU value across the cluster by running this PromQL query in Prometheus:
+               quantile by (cluster, device) (0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
+
+             - Standardize MTU across all nodes to match the median (commonly 1500 or 9000):
+               ip link set dev eth4 mtu <median-value>
+
+             - Make MTU setting persistent:
+               - RHEL/CentOS: edit `/etc/sysconfig/network-scripts/ifcfg-<device>`
+               - Debian/Ubuntu: edit `/etc/netplan/*.yaml` and apply with `netplan apply`
+
+             - Restart the affected interface or node if required.
 
  # pool full, data series has 6 but using topk(5) so to ensure the
  # results are working as expected
index c726ea319ed6bd7a803197bbdf76c6afae340d83..700f747a90ed68ae1cbc6bb14f06bbdbbb608989 100644 (file)
@@ -29,7 +29,9 @@
                         [appendParentKey]="false"
                         [data]="expandedRow"
                         [customCss]="customCss"
-                        [autoReload]="false">
+                        [autoReload]="false"
+                        [showMultiLineText]="true"
+                        [multilineTextKeys]="multilineTextKeys">
     </cd-table-key-value>
   </ng-container>
 </cd-table>
index e3892f0a67942b28eae4cc6307a94cac6e797a3e..95071f88c50973a3480ccfaa0b0b4ce228b25cd8 100644 (file)
@@ -28,6 +28,7 @@ export class ActiveAlertListComponent extends PrometheusListHelper implements On
   permission: Permission;
   selection = new CdTableSelection();
   icons = Icons;
+  multilineTextKeys = ['description', 'impact', 'fix'];
 
   constructor(
     // NotificationsComponent will refresh all alerts every 5s (No need to do it here as well)
index 4dc625459d5b0ec68fd9cd419b208c2348ff029b..4dbaf63f32d4292560cb399b5773f6b1ccae8a2f 100644 (file)
             [limit]="0">
   </cd-table>
 </div>
+
+<ng-template #valueCellTpl
+             let-row="data.row"
+             let-value="data.value">
+  <!-- Use the pre-wrap styled span only for rows whose key is listed in
+       multilineTextKeys; every other row falls back to the plain span.
+       `includes` tests membership by value (the `in` operator would test
+       array indices instead), and `?.` tolerates the input being unset. -->
+  <span
+    class="pre-wrap"
+    *ngIf="multilineTextKeys?.includes(row.key); else normalText"
+  >{{ value }}</span>
+  <ng-template #normalText>
+    <span>{{ value }}</span>
+  </ng-template>
+</ng-template>
index 0f450ce2a57c84a8409b95e4c698368c746e019a..e40ad1134882c945da2681262f76b1dff3db274d 100644 (file)
@@ -5,6 +5,7 @@ import {
   OnChanges,
   OnInit,
   Output,
+  TemplateRef,
   ViewChild
 } from '@angular/core';
 
@@ -36,6 +37,8 @@ interface KeyValueItem {
 export class TableKeyValueComponent implements OnInit, OnChanges {
   @ViewChild(TableComponent, { static: true })
   table: TableComponent;
+  @ViewChild('valueCellTpl', { static: true })
+  valueCellTpl: TemplateRef<any>;
 
   @Input()
   data: any;
@@ -50,6 +53,10 @@ export class TableKeyValueComponent implements OnInit, OnChanges {
   hideEmpty = false;
   @Input()
   hideKeys: string[] = []; // Keys of pairs not to be displayed
+  @Input()
+  showMultiLineText = false; // If true, the value field will use a template that supports multi line text
+  @Input()
+  multilineTextKeys: string[]; // If set, the value field will use a template that supports multi line text for this key
 
   // If set, the classAddingTpl is used to enable different css for different values
   @Input()
@@ -81,6 +88,9 @@ export class TableKeyValueComponent implements OnInit, OnChanges {
     if (this.customCss) {
       this.columns[1].cellTransformation = CellTemplate.classAdding;
     }
+    if (this.showMultiLineText) {
+      this.columns[1].cellTemplate = this.valueCellTpl;
+    }
     // We need to subscribe the 'fetchData' event here and not in the
     // HTML template, otherwise the data table will display the loading
     // indicator infinitely if data is only bound via '[data]="xyz"'.