From bee24dec441b9e6b263e4498c2ab333b0a60a52d Mon Sep 17 00:00:00 2001 From: Aashish Sharma Date: Wed, 2 Jul 2025 16:35:14 +0530 Subject: [PATCH] monitoring: fix MTU Mismatch alert rule and expr Fixes: https://tracker.ceph.com/issues/73290 Signed-off-by: Aashish Sharma --- monitoring/ceph-mixin/prometheus_alerts.yml | 38 +++++-- .../ceph-mixin/tests_alerts/test_alerts.yml | 99 ++++++++++--------- .../active-alert-list.component.html | 4 + .../active-alert-list.component.ts | 1 + .../table-key-value.component.html | 12 +++ .../table-key-value.component.scss | 3 + .../table-key-value.component.ts | 10 ++ 7 files changed, 112 insertions(+), 55 deletions(-) diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 5f840d93a9813..e58072b9bcfbf 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -511,14 +511,38 @@ groups: oid: "1.3.6.1.4.1.50495.1.2.1.8.4" severity: "warning" type: "ceph_default" - - alert: "CephNodeInconsistentMTU" - annotations: - description: "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}." 
- summary: "MTU settings across Ceph hosts are inconsistent on cluster {{ $labels.cluster }}" - expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )" + - alert: CephNodeInconsistentMTU + expr: | + node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) + != on (cluster, device) group_left + quantile by (cluster, device) ( + 0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) + ) labels: - severity: "warning" - type: "ceph_default" + severity: warning + type: ceph_default + annotations: + summary: "Node {{ $labels.instance }} has inconsistent MTU settings in cluster {{ $labels.cluster }}" + description: "Network interface {{ $labels.device }} on node {{ $labels.instance }} has MTU {{ $value }} which differs from the cluster median." 
+ impact: | + - May cause packet fragmentation or packet drops + - Risk of degraded cluster communication and performance + - Potential instability in services relying on consistent networking (e.g., Ceph, Kubernetes) + fix: | + - Check the MTU of interface `{{ $labels.device }}` on node `{{ $labels.instance }}`: + ip link show {{ $labels.device }} + + - Find the median MTU value across the cluster by running this PromQL query in Prometheus: + quantile by (cluster, device) (0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + + - Standardize MTU across all nodes to match the median (commonly 1500 or 9000): + ip link set dev {{ $labels.device }} mtu + + - Make MTU setting persistent: + - RHEL/CentOS: edit `/etc/sysconfig/network-scripts/ifcfg-` + - Debian/Ubuntu: edit `/etc/netplan/*.yaml` and apply with `netplan apply` + + - Restart the affected interface or node if required. - name: "pools" rules: - alert: "CephPoolGrowthWarning" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index fdb892ff05718..0beb1da6fddbf 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -505,80 +505,83 @@ tests: summary: Host filesystem free space is getting low on cluster mycluster description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate." 
- # MTU Mismatch +# MTU Mismatch - interval: 1m input_series: - - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}' + - series: 'node_network_mtu_bytes{device="eth0",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}' + - series: 'node_network_mtu_bytes{device="eth1",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}' + - series: 'node_network_mtu_bytes{device="eth2",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}' + - series: 'node_network_mtu_bytes{device="eth3",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}' + + - series: 'node_network_mtu_bytes{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}' + values: '1500 1500 1500 1500 1500' + - series: 'node_network_mtu_bytes{device="eth4",instance="node2",job="node-exporter",cluster="mycluster"}' values: '9000 9000 9000 9000 9000' - - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}' - values: '2200 2200 2200 2200 2200' - - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}' - values: '2400 2400 2400 2400 2400' - - series: 'node_network_up{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}' - values: '0 0 0 0 0' - - series: 
'node_network_up{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}' - values: '0 0 0 0 0' - - series: 'node_network_up{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}' + - series: 'node_network_mtu_bytes{device="eth4",instance="node3",job="node-exporter",cluster="mycluster"}' + values: '9000 9000 9000 9000 9000' + + - series: 'node_network_up{device="eth0",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}' + - series: 'node_network_up{device="eth1",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}' + - series: 'node_network_up{device="eth2",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}' + - series: 'node_network_up{device="eth3",instance="node1",job="node-exporter",cluster="mycluster"}' + values: '1 1 1 1 1' + - series: 'node_network_up{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}' + values: '1 1 1 1 1' + - series: 'node_network_up{device="eth4",instance="node2",job="node-exporter",cluster="mycluster"}' + values: '1 1 1 1 1' + - series: 'node_network_up{device="eth4",instance="node3",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}' - values: '0 0 0 0 0' promql_expr_test: - expr: | - node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == - scalar( - max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != - quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) - ) - or - 
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == - scalar( - min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != - quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) - ) + node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) + != on (cluster, device) group_left + quantile by (cluster, device) ( + 0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) + ) eval_time: 1m exp_samples: - - labels: '{device="eth4", instance="node-exporter", job="node-exporter", cluster="mycluster"}' - value: 9000 - - labels: '{device="eth4", instance="hostname1", job="node-exporter", cluster="mycluster"}' - value: 2200 + - labels: '{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}' + value: 1500 alert_rule_test: - eval_time: 1m alertname: CephNodeInconsistentMTU exp_alerts: - exp_labels: device: eth4 - instance: hostname1 + instance: node1 job: node-exporter severity: warning type: ceph_default cluster: "mycluster" exp_annotations: - summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster - description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4." - - exp_labels: - device: eth4 - instance: node-exporter - job: node-exporter - severity: warning - type: ceph_default - cluster: "mycluster" - exp_annotations: - summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster - description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4." + summary: "Node node1 has inconsistent MTU settings in cluster mycluster" + description: "Network interface eth4 on node node1 has MTU 1500 which differs from the cluster median." 
+ impact: | + - May cause packet fragmentation or packet drops + - Risk of degraded cluster communication and performance + - Potential instability in services relying on consistent networking (e.g., Ceph, Kubernetes) + fix: | + - Check the MTU of interface `eth4` on node `node1`: + ip link show eth4 + + - Find the median MTU value across the cluster by running this PromQL query in Prometheus: + quantile by (cluster, device) (0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + + - Standardize MTU across all nodes to match the median (commonly 1500 or 9000): + ip link set dev eth4 mtu + + - Make MTU setting persistent: + - RHEL/CentOS: edit `/etc/sysconfig/network-scripts/ifcfg-` + - Debian/Ubuntu: edit `/etc/netplan/*.yaml` and apply with `netplan apply` + + - Restart the affected interface or node if required. # pool full, data series has 6 but using topk(5) so to ensure the # results are working as expected diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html index b3ba03caed840..52bd8b9b5c22f 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html @@ -36,6 +36,8 @@ [data]="expandedRow" [customCss]="customCss" [autoReload]="false" + [showMultiLineText]="true" + [multilineTextKeys]="multilineTextKeys" > } @else if (expandedRow?.alert_count > 1) { @@ -60,6 +62,8 @@ [data]="expandedInnerRow" [customCss]="customCss" [autoReload]="false" + [showMultiLineText]="true" + [multilineTextKeys]="multilineTextKeys" > } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts 
b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts index cc12b29bfcab4..3266109490fc8 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts @@ -30,6 +30,7 @@ export class ActiveAlertListComponent extends PrometheusListHelper implements On selection = new CdTableSelection(); icons = Icons; expandedInnerRow: any; + multilineTextKeys = ['description', 'impact', 'fix']; constructor( // NotificationsComponent will refresh all alerts every 5s (No need to do it here as well) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html index 4dc625459d5b0..4dbaf63f32d42 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html @@ -12,3 +12,15 @@ [limit]="0"> + + + {{ value }} + + {{ value }} + + diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.scss b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.scss index e69de29bb2d1d..962da03ac106e 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.scss +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.scss @@ -0,0 +1,3 @@ +.pre-wrap { + white-space: pre-wrap; +} diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.ts 
b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.ts index 0f450ce2a57c8..e40ad1134882c 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.ts @@ -5,6 +5,7 @@ import { OnChanges, OnInit, Output, + TemplateRef, ViewChild } from '@angular/core'; @@ -36,6 +37,8 @@ interface KeyValueItem { export class TableKeyValueComponent implements OnInit, OnChanges { @ViewChild(TableComponent, { static: true }) table: TableComponent; + @ViewChild('valueCellTpl', { static: true }) + valueCellTpl: TemplateRef<any>; @Input() data: any; @@ -50,6 +53,10 @@ export class TableKeyValueComponent implements OnInit, OnChanges { hideEmpty = false; @Input() hideKeys: string[] = []; // Keys of pairs not to be displayed + @Input() + showMultiLineText = false; // If true, the value field will use a template that supports multi line text + @Input() + multilineTextKeys: string[]; // If set, the value field will use a template that supports multi line text for these keys // If set, the classAddingTpl is used to enable different css for different values @Input() @@ -81,6 +88,9 @@ export class TableKeyValueComponent implements OnInit, OnChanges { if (this.customCss) { this.columns[1].cellTransformation = CellTemplate.classAdding; } + if (this.showMultiLineText) { + this.columns[1].cellTemplate = this.valueCellTpl; + } // We need to subscribe the 'fetchData' event here and not in the // HTML template, otherwise the data table will display the loading // indicator infinitely if data is only bound via '[data]="xyz"'. -- 2.39.5