From 653c3f66823179fc5b9cbb74ff932d61a6c4178c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Beno=C3=AEt=20Knecht?= Date: Thu, 30 Apr 2020 10:50:07 +0200 Subject: [PATCH] monitoring: Fix "10% OSDs down" alert description MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The alert was triggered when 90% or fewer of the OSDs were _up_, but then the description took that value and described it as the percentage of OSDs being _down_. So with 12% of OSDs down, the alert description would read: ``` 88% or 88 of 100 OSDs are down (>=10%). ``` which can be panic-inducing. This commit changes the alert expression to actually compute the percentage of OSDs that are down, which makes the correct value appear in the description. Signed-off-by: Benoît Knecht --- monitoring/prometheus/alerts/ceph_default_alerts.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml index a0074f0779f0c..51d19bfca963e 100644 --- a/monitoring/prometheus/alerts/ceph_default_alerts.yml +++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml @@ -47,14 +47,14 @@ groups: - name: osd rules: - alert: 10% OSDs down - expr: (sum(ceph_osd_up) / count(ceph_osd_up)) * 100 <= 90 + expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10 labels: severity: critical type: ceph_default oid: 1.3.6.1.4.1.50495.15.1.2.4.1 annotations: description: | - {{ $value | humanize}}% or {{with query "sum(ceph_osd_up)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)"}}{{. | first | value }}{{ end }} OSDs are down (>=10%). + {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (≥ 10%). 
The following OSDs are down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} -- 2.39.5