]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/dashboard: prometheus alerting: add some leeway for packet drops and errors...
author: Patrick Seidensal <pseidensal@suse.com>
Wed, 11 Nov 2020 17:55:30 +0000 (18:55 +0100)
committer: Patrick Seidensal <pseidensal@suse.com>
Tue, 16 Feb 2021 13:43:00 +0000 (14:43 +0100)
Fixes: https://tracker.ceph.com/issues/48201
Signed-off-by: Patrick Seidensal <pseidensal@suse.com>
monitoring/prometheus/alerts/ceph_default_alerts.yml

index b14eb15460ccc7ee17739c1ac005fb50b8f81561..f7e8ce1188e0ad02f0ca033c102bc7561faa93c0 100644 (file)
@@ -175,30 +175,48 @@ groups:
           description: >
             Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
 
-      # alert on nic packet errors and drops rates > 1 packet/s
+      # alert on nic packet errors and drops >= 0.01% of packets or >= 10 packets per minute
       - alert: network packets dropped
-        expr: irate(node_network_receive_drop_total{device!="lo"}[5m]) + irate(node_network_transmit_drop_total{device!="lo"}[5m]) > 1
+        expr: |
+          (
+            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_drop_total{device!="lo"}[1m])
+          ) / (
+            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_packets_total{device!="lo"}[1m])
+          ) >= 0.0001 or (
+            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_drop_total{device!="lo"}[1m])
+          ) >= 10
         labels:
           severity: warning
           type: ceph_default
           oid: 1.3.6.1.4.1.50495.15.1.2.8.2
         annotations:
           description: >
-            Node {{ $labels.instance }} experiences packet drop > 1
-            packet/s on interface {{ $labels.device }}.
+            Node {{ $labels.instance }} experiences packet drop >= 0.01% or >=
+            10 packets/min on interface {{ $labels.device }}.
 
       - alert: network packet errors
         expr: |
-          irate(node_network_receive_errs_total{device!="lo"}[5m]) +
-          irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
+          (
+            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_errs_total{device!="lo"}[1m])
+          ) / (
+            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_packets_total{device!="lo"}[1m])
+          ) >= 0.0001 or (
+            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_errs_total{device!="lo"}[1m])
+          ) >= 10
         labels:
           severity: warning
           type: ceph_default
           oid: 1.3.6.1.4.1.50495.15.1.2.8.3
         annotations:
           description: >
-            Node {{ $labels.instance }} experiences packet errors > 1
-            packet/s on interface {{ $labels.device }}.
+            Node {{ $labels.instance }} experiences packet errors >= 0.01% or
+            >= 10 packets/min on interface {{ $labels.device }}.
 
       - alert: storage filling up
         expr: |