From: Christian Kugler Date: Sun, 16 Oct 2022 17:21:01 +0000 (+0200) Subject: ceph-mixin: Add Prometheus Alert for Degraded Bond X-Git-Tag: v18.1.0~917^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F48538%2Fhead;p=ceph.git ceph-mixin: Add Prometheus Alert for Degraded Bond Currently there is no alert for a network interface card to be misconfigured or failed which is part of a network bond. This could lead to redundancies and performance being degraded unnoticed. To solve this, I use node exporter metrics to look at the number of total peers of the bond and the ones that are active. If the numbers differ, something is up and should be looked at. Fixes: https://tracker.ceph.com/issues/57962 Signed-off-by: Christian Kugler --- diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index bed89a879064..d56628fb2d75 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -551,6 +551,17 @@ description: 'Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}.', }, }, + { + alert: 'CephNodeNetworkBondDegraded', + expr: ||| + node_bonding_slaves - node_bonding_active != 0 + |||, + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + summary: 'Degraded Bond on Node {{ $labels.instance }}%(cluster)s' % $.MultiClusterSummary(), + description: 'Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}.', + }, + }, { alert: 'CephNodeDiskspaceWarning', expr: 'predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0', diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index a544d41eb0ee..47fdd43a81b7 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -495,6 +495,15 @@ groups: oid: "1.3.6.1.4.1.50495.1.2.1.8.3" severity: "warning" type: "ceph_default" + - alert: "CephNodeNetworkBondDegraded" + annotations: + summary: "Degraded Bond on Node {{ $labels.instance }}" + description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}." + expr: | + node_bonding_slaves - node_bonding_active != 0 + labels: + severity: "warning" + type: "ceph_default" - alert: "CephNodeDiskspaceWarning" annotations: description: "Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate." diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 7b7e7db7301b..8902d9c1493a 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -470,6 +470,37 @@ tests: summary: One or more NICs reports packet errors description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0." + # Bond is missing a peer + - interval: 1m + input_series: + - series: 'node_bonding_active{master="bond0", + instance="node-exporter",job="node-exporter"}' + values: '3' + - series: 'node_bonding_slaves{master="bond0", + instance="node-exporter",job="node-exporter"}' + values: '4' + promql_expr_test: + - expr: | + node_bonding_slaves - node_bonding_active != 0 + eval_time: 5m + exp_samples: + - labels: '{master="bond0", instance="node-exporter", + job="node-exporter"}' + value: 1 + alert_rule_test: + - eval_time: 5m + alertname: CephNodeNetworkBondDegraded + exp_alerts: + - exp_labels: + master: bond0 + instance: node-exporter + job: node-exporter + severity: warning + type: ceph_default + exp_annotations: + summary: Degraded Bond on Node node-exporter + description: "Bond bond0 is degraded on Node node-exporter." + # Node Storage disk space filling up - interval: 1m # 20GB = 21474836480, 256MB = 268435456