]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph-mixin: Add Prometheus Alert for Degraded Bond 48538/head
author Christian Kugler <syphdias+git@gmail.com>
Sun, 16 Oct 2022 17:21:01 +0000 (19:21 +0200)
committer Christian Kugler <syphdias+git@gmail.com>
Wed, 2 Nov 2022 13:48:57 +0000 (14:48 +0100)
Currently there is no alert for a network interface card that is part of a
network bond being misconfigured or having failed.

This could lead to redundancy and performance being degraded unnoticed.

To solve this, I use node exporter metrics to look at the number of total peers
of the bond and the ones that are active. If the numbers differ, something is up
and should be looked at.

Fixes: https://tracker.ceph.com/issues/57962
Signed-off-by: Christian Kugler <syphdias+git@gmail.com>
monitoring/ceph-mixin/prometheus_alerts.libsonnet
monitoring/ceph-mixin/prometheus_alerts.yml
monitoring/ceph-mixin/tests_alerts/test_alerts.yml

index bed89a8790648e7d10b70f31db33938783c67fd2..d56628fb2d75c054b1986b6fe9042cd1a6fe8b40 100644 (file)
             description: 'Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}.',
           },
         },
+        {  // Warning-level alert for a bond whose slave count and active count differ.
+          alert: 'CephNodeNetworkBondDegraded',
+          expr: |||
+            node_bonding_slaves - node_bonding_active != 0
+          |||,  // Any non-zero difference between the two metrics matches.
+          labels: { severity: 'warning', type: 'ceph_default' },
+          annotations: {
+            summary: 'Degraded Bond on Node {{ $labels.instance }}%(cluster)s' % $.MultiClusterSummary(),
+            description: 'Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}.',  // `master` label carries the bond name (bond0 in the unit test).
+          },
+        },
         {
           alert: 'CephNodeDiskspaceWarning',
           expr: 'predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0',
index a544d41eb0ee0ac15806a52c93ea5c36856c20cf..47fdd43a81b732dfb6f8be98b44ba8d7f825e215 100644 (file)
@@ -495,6 +495,15 @@ groups:
           oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
           severity: "warning"
           type: "ceph_default"
+      - alert: "CephNodeNetworkBondDegraded"  # warns when a bond's slave and active counts differ
+        annotations:
+          summary: "Degraded Bond on Node {{ $labels.instance }}"
+          description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}."
+        expr: |
+          node_bonding_slaves - node_bonding_active != 0
+        labels:  # no `for:` is set, so the rule has no pending period
+          severity: "warning"
+          type: "ceph_default"
       - alert: "CephNodeDiskspaceWarning"
         annotations:
           description: "Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate."
index 7b7e7db7301bddc9f0d55e2ca2b56198fb89652c..8902d9c1493a36a196e8e055687616cb59ec8377 100644 (file)
@@ -470,6 +470,37 @@ tests:
            summary: One or more NICs reports packet errors
            description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0."
 
+ # Bond is missing a peer: only 3 of 4 slaves are active, so the alert must fire
+ - interval: 1m
+   input_series:
+    - series: 'node_bonding_active{master="bond0",
+      instance="node-exporter",job="node-exporter"}'
+      values: '3+0x5'  # constant 3, sampled every minute up to eval_time
+    - series: 'node_bonding_slaves{master="bond0",
+      instance="node-exporter",job="node-exporter"}'
+      values: '4+0x5'  # constant 4, sampled every minute up to eval_time
+   promql_expr_test:
+     - expr: |
+         node_bonding_slaves - node_bonding_active != 0
+       eval_time: 5m
+       exp_samples:
+         - labels: '{master="bond0", instance="node-exporter",
+           job="node-exporter"}'
+           value: 1  # 4 slaves - 3 active
+   alert_rule_test:
+     - eval_time: 5m
+       alertname: CephNodeNetworkBondDegraded
+       exp_alerts:
+       - exp_labels:
+           master: bond0
+           instance: node-exporter
+           job: node-exporter
+           severity: warning
+           type: ceph_default
+         exp_annotations:
+           summary: Degraded Bond on Node node-exporter
+           description: "Bond bond0 is degraded on Node node-exporter."
+
 # Node Storage disk space filling up
  - interval: 1m
    # 20GB = 21474836480, 256MB = 268435456