From eb83409f5f7d1467a5e9738b109ba0fc686e73da Mon Sep 17 00:00:00 2001 From: Aashish Sharma Date: Wed, 2 Jul 2025 16:35:14 +0530 Subject: [PATCH] monitoring: fix MTU Mismatch alert rule and expr Fixes: https://tracker.ceph.com/issues/73290 Signed-off-by: Aashish Sharma (cherry picked from commit bee24dec441b9e6b263e4498c2ab333b0a60a52d) Conflicts: monitoring/ceph-mixin/prometheus_alerts.yml monitoring/ceph-mixin/tests_alerts/test_alerts.yml src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.scss --- monitoring/ceph-mixin/prometheus_alerts.yml | 38 ++++-- .../ceph-mixin/tests_alerts/test_alerts.yml | 110 ++++++++---------- .../active-alert-list.component.html | 4 +- .../active-alert-list.component.ts | 1 + .../table-key-value.component.html | 12 ++ .../table-key-value.component.scss | 4 + .../table-key-value.component.ts | 10 ++ 7 files changed, 111 insertions(+), 68 deletions(-) diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 84452e5845a..ba06f01f1f9 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -513,14 +513,38 @@ groups: oid: "1.3.6.1.4.1.50495.1.2.1.8.4" severity: "warning" type: "ceph_default" - - alert: "CephNodeInconsistentMTU" - annotations: - description: "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}." - summary: "MTU settings across Ceph hosts are inconsistent" - expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )" + - alert: CephNodeInconsistentMTU + expr: | + node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) + != on (cluster, device) group_left + quantile by (cluster, device) ( + 0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) + ) labels: - severity: "warning" - type: "ceph_default" + severity: warning + type: ceph_default + annotations: + summary: "Node {{ $labels.instance }} has inconsistent MTU settings in cluster {{ $labels.cluster }}" + description: "Network interface {{ $labels.device }} on node {{ $labels.instance }} has MTU {{ $value }} which differs from the cluster median." + impact: | + - May cause packet fragmentation or packet drops + - Risk of degraded cluster communication and performance + - Potential instability in services relying on consistent networking (e.g., Ceph, Kubernetes) + fix: | + - Check the MTU of interface `{{ $labels.device }}` on node `{{ $labels.instance }}`: + ip link show {{ $labels.device }} + + - Find the median MTU value across the cluster by running this PromQL query in Prometheus: + quantile by (cluster, device) (0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + + - Standardize MTU across all nodes to match the median (commonly 1500 or 9000): + ip link set dev {{ $labels.device }} mtu + + - Make MTU setting persistent: + - RHEL/CentOS: edit `/etc/sysconfig/network-scripts/ifcfg-` + - Debian/Ubuntu: edit `/etc/netplan/*.yaml` and apply with `netplan apply` + + - Restart the affected interface or node if required. - name: "pools" rules: - alert: "CephPoolGrowthWarning" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 40d6f4d0983..0efc3c9ad24 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -543,90 +543,80 @@ tests: # MTU Mismatch - interval: 1m input_series: - - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth0",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth1",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth2",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth3",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter", - job="node-exporter"}' + + - series: 'node_network_mtu_bytes{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}' + values: '1500 1500 1500 1500 1500' + - series: 'node_network_mtu_bytes{device="eth4",instance="node2",job="node-exporter",cluster="mycluster"}' values: '9000 9000 9000 9000 9000' - - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1", - job="node-exporter"}' - values: '2200 2200 2200 2200 2200' - - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2", - job="node-exporter"}' - values: '2400 2400 2400 2400 2400' - - series: 'node_network_up{device="eth0",instance="node-exporter", - job="node-exporter"}' - values: '0 0 0 0 0' - - series: 'node_network_up{device="eth1",instance="node-exporter", - job="node-exporter"}' - values: '0 0 0 0 0' - - series: 'node_network_up{device="eth2",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth4",instance="node3",job="node-exporter",cluster="mycluster"}' + values: '9000 9000 9000 9000 9000' + + - series: 'node_network_up{device="eth0",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth3",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_up{device="eth1",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth4",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_up{device="eth2",instance="node1",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth4",instance="hostname1", - job="node-exporter"}' + - series: 'node_network_up{device="eth3",instance="node1",job="node-exporter",cluster="mycluster"}' + values: '1 1 1 1 1' + - series: 'node_network_up{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}' + values: '1 1 1 1 1' + - series: 'node_network_up{device="eth4",instance="node2",job="node-exporter",cluster="mycluster"}' + values: '1 1 1 1 1' + - series: 'node_network_up{device="eth4",instance="node3",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth4",instance="hostname2", - job="node-exporter"}' - values: '0 0 0 0 0' promql_expr_test: - expr: | - node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == - scalar( - max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != - quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) - ) - or - node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == - scalar( - min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != - quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) - ) + node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) + != on (cluster, device) group_left + quantile by (cluster, device) ( + 0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) + ) eval_time: 1m exp_samples: - - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}' - value: 9000 - - labels: '{device="eth4", instance="hostname1", job="node-exporter"}' - value: 2200 + - labels: '{device="eth4",instance="node1",job="node-exporter",cluster="mycluster"}' + value: 1500 alert_rule_test: - eval_time: 1m alertname: CephNodeInconsistentMTU exp_alerts: - exp_labels: device: eth4 - instance: hostname1 + instance: node1 job: node-exporter severity: warning type: ceph_default + cluster: "mycluster" exp_annotations: - summary: MTU settings across Ceph hosts are inconsistent - description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4." - - exp_labels: - device: eth4 - instance: node-exporter - job: node-exporter - severity: warning - type: ceph_default - exp_annotations: - summary: MTU settings across Ceph hosts are inconsistent - description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4." + summary: "Node node1 has inconsistent MTU settings in cluster mycluster" + description: "Network interface eth4 on node node1 has MTU 1500 which differs from the cluster median." + impact: | + - May cause packet fragmentation or packet drops + - Risk of degraded cluster communication and performance + - Potential instability in services relying on consistent networking (e.g., Ceph, Kubernetes) + fix: | + - Check the MTU of interface `eth4` on node `node1`: + ip link show eth4 + + - Find the median MTU value across the cluster by running this PromQL query in Prometheus: + quantile by (cluster, device) (0.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + + - Standardize MTU across all nodes to match the median (commonly 1500 or 9000): + ip link set dev eth4 mtu + + - Make MTU setting persistent: + - RHEL/CentOS: edit `/etc/sysconfig/network-scripts/ifcfg-` + - Debian/Ubuntu: edit `/etc/netplan/*.yaml` and apply with `netplan apply` + - Restart the affected interface or node if required. # pool full, data series has 6 but using topk(5) so to ensure the # results are working as expected - interval: 1m diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html index 278bc4ddc46..dcffc7b85a6 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.html @@ -29,7 +29,9 @@ [appendParentKey]="false" [data]="expandedRow" [customCss]="customCss" - [autoReload]="false"> + [autoReload]="false" + [showMultiLineText]="true" + [multilineTextKeys]="multilineTextKeys"> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts index de027bfec50..348637179f9 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/active-alert-list/active-alert-list.component.ts @@ -28,6 +28,7 @@ export class ActiveAlertListComponent extends PrometheusListHelper implements On permission: Permission; selection = new CdTableSelection(); icons = Icons; + multilineTextKeys = ['description', 'impact', 'fix']; constructor( // NotificationsComponent will refresh all alerts every 5s (No need to do it here as well) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html index b022f1551e8..3698d30a405 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html @@ -12,3 +12,15 @@ [limit]="0"> + + + {{ value }} + + {{ value }} + + diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.scss b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.scss index f8d8745d44d..facc94954a8 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.scss +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.scss @@ -3,3 +3,7 @@ max-height: 40vh; overflow: auto; } + +.pre-wrap { + white-space: pre-wrap; +} diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.ts index 0f450ce2a57..e40ad113488 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.ts @@ -5,6 +5,7 @@ import { OnChanges, OnInit, Output, + TemplateRef, ViewChild } from '@angular/core'; @@ -36,6 +37,8 @@ interface KeyValueItem { export class TableKeyValueComponent implements OnInit, OnChanges { @ViewChild(TableComponent, { static: true }) table: TableComponent; + @ViewChild('valueCellTpl', { static: true }) + valueCellTpl: TemplateRef; @Input() data: any; @@ -50,6 +53,10 @@ export class TableKeyValueComponent implements OnInit, OnChanges { hideEmpty = false; @Input() hideKeys: string[] = []; // Keys of pairs not to be displayed + @Input() + showMultiLineText = false; // If true, the value field will use a template that supports multi line text + @Input() + multilineTextKeys: string[]; // If set, the value field will use a template that supports multi line text for this key // If set, the classAddingTpl is used to enable different css for different values @Input() @@ -81,6 +88,9 @@ export class TableKeyValueComponent implements OnInit, OnChanges { if (this.customCss) { this.columns[1].cellTransformation = CellTemplate.classAdding; } + if (this.showMultiLineText) { + this.columns[1].cellTemplate = this.valueCellTpl; + } // We need to subscribe the 'fetchData' event here and not in the // HTML template, otherwise the data table will display the loading // indicator infinitely if data is only bound via '[data]="xyz"'. -- 2.39.5