]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/dashboard: prometheus alerting: add some leeway for packet drops and errors... 39508/head
author Patrick Seidensal <pseidensal@suse.com>
Wed, 11 Nov 2020 17:55:30 +0000 (18:55 +0100)
committer Konstantin Shalygin <k0ste@k0ste.ru>
Wed, 17 Feb 2021 05:29:05 +0000 (12:29 +0700)
Fixes: https://tracker.ceph.com/issues/48201
Signed-off-by: Patrick Seidensal <pseidensal@suse.com>
(cherry picked from commit 9ac248b0c309b95d33bb0b1b64dc2e397bf8508e)

monitoring/prometheus/alerts/ceph_default_alerts.yml

index abc6f647142b58447855ad36c3d8a3e9c526123d..949e606bff0d4c5c1d0a2bd152951b5a992d4740 100644 (file)
@@ -175,30 +175,48 @@ groups:
           description: >
             Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
 
-      # alert on nic packet errors and drops rates > 1 packet/s
+      # alert on nic packet error and drop rates >= 0.01% of all packets,
+      # or >= 10 packets within the 1m evaluation window
       - alert: network packets dropped
-        expr: irate(node_network_receive_drop_total{device!="lo"}[5m]) + irate(node_network_transmit_drop_total{device!="lo"}[5m]) > 1
+        expr: |
+          (
+            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_drop_total{device!="lo"}[1m])
+          ) / (
+            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_packets_total{device!="lo"}[1m])
+          ) >= 0.0001 or (
+            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_drop_total{device!="lo"}[1m])
+          ) >= 10
         labels:
           severity: warning
           type: ceph_default
           oid: 1.3.6.1.4.1.50495.15.1.2.8.2
         annotations:
           description: >
-            Node {{ $labels.instance }} experiences packet drop > 1
-            packet/s on interface {{ $labels.device }}.
+            Node {{ $labels.instance }} experiences packet drop > 0.01% or >
+            10 packets/s on interface {{ $labels.device }}.
 
       - alert: network packet errors
         expr: |
-          irate(node_network_receive_errs_total{device!="lo"}[5m]) +
-          irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
+          (
+            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_errs_total{device!="lo"}[1m])
+          ) / (
+            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_packets_total{device!="lo"}[1m])
+          ) >= 0.0001 or (
+            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_errs_total{device!="lo"}[1m])
+          ) >= 10
         labels:
           severity: warning
           type: ceph_default
           oid: 1.3.6.1.4.1.50495.15.1.2.8.3
         annotations:
           description: >
-            Node {{ $labels.instance }} experiences packet errors > 1
-            packet/s on interface {{ $labels.device }}.
+            Node {{ $labels.instance }} experiences packet errors > 0.01% or
+            > 10 packets/s on interface {{ $labels.device }}.
 
       - alert: storage filling up
         expr: |