From 435fe885e0b5a25b8e0c336a5fe5a57c88d14ebc Mon Sep 17 00:00:00 2001
From: Patrick Seidensal
Date: Wed, 11 Nov 2020 18:55:30 +0100
Subject: [PATCH] mgr/dashboard: prometheus alerting: add some leeway for package drops and errors (1%)

Fixes: https://tracker.ceph.com/issues/48201
Signed-off-by: Patrick Seidensal
(cherry picked from commit 9ac248b0c309b95d33bb0b1b64dc2e397bf8508e)
---
Review note (ignored by git am; not part of the commit message):
the in-file comment and the two alert descriptions were adjusted to match
the expressions below. The ratio threshold 0.0001 is 0.01% (not 1%), and
increase(...[1m]) >= 10 counts packets per one-minute window (not per
second). The PromQL expressions themselves are unchanged.

 .../prometheus/alerts/ceph_default_alerts.yml | 34 ++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml
index 92eb391677583..ced497d5c209e 100644
--- a/monitoring/prometheus/alerts/ceph_default_alerts.yml
+++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml
@@ -164,28 +164,46 @@ groups:
           description: >
             Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
 
-      # alert on nic packet errors and drops rates > 1 packet/s
+      # alert on nic packet errors and drops rates > 0.01% or > 10 packets/min
       - alert: network packets dropped
-        expr: irate(node_network_receive_drop_total{device!="lo"}[5m]) + irate(node_network_transmit_drop_total{device!="lo"}[5m]) > 1
+        expr: |
+          (
+            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_drop_total{device!="lo"}[1m])
+          ) / (
+            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_packets_total{device!="lo"}[1m])
+          ) >= 0.0001 or (
+            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_drop_total{device!="lo"}[1m])
+          ) >= 10
         labels:
           severity: warning
           type: ceph_default
         annotations:
           description: >
-            Node {{ $labels.instance }} experiences packet drop > 1
-            packet/s on interface {{ $labels.device }}.
+            Node {{ $labels.instance }} experiences packet drop > 0.01% or >
+            10 packets/min on interface {{ $labels.device }}.
 
       - alert: network packet errors
         expr: |
-          irate(node_network_receive_errs_total{device!="lo"}[5m]) +
-          irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
+          (
+            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_errs_total{device!="lo"}[1m])
+          ) / (
+            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_packets_total{device!="lo"}[1m])
+          ) >= 0.0001 or (
+            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_errs_total{device!="lo"}[1m])
+          ) >= 10
         labels:
           severity: warning
           type: ceph_default
         annotations:
           description: >
-            Node {{ $labels.instance }} experiences packet errors > 1
-            packet/s on interface {{ $labels.device }}.
+            Node {{ $labels.instance }} experiences packet errors > 0.01% or
+            > 10 packets/min on interface {{ $labels.device }}.
 
       - alert: storage filling up
         expr: |
-- 
2.39.5