monitoring: Fix "10% OSDs down" alert description

author Benoît Knecht <bknecht@protonmail.ch>

Thu, 30 Apr 2020 08:50:07 +0000 (10:50 +0200)

committer Benoît Knecht <bknecht@protonmail.ch>

Wed, 6 May 2020 16:49:26 +0000 (18:49 +0200)
author Benoît Knecht <bknecht@protonmail.ch>
Thu, 30 Apr 2020 08:50:07 +0000 (10:50 +0200)
committer Benoît Knecht <bknecht@protonmail.ch>
Wed, 6 May 2020 16:49:26 +0000 (18:49 +0200)
diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml

index a0074f0779f0c1c691b5b0b6e723ea3bebcaf338..51d19bfca963e9c94e413ff82977b9b67cc8e716 100644 (file)
--- a/monitoring/prometheus/alerts/ceph_default_alerts.yml
+++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml
@@ -47,14 +47,14 @@ groups:
    - name: osd
      rules:
        - alert: 10% OSDs down
-        expr: (sum(ceph_osd_up) / count(ceph_osd_up)) * 100 <= 90
+        expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
          labels:
            severity: critical
            type: ceph_default
            oid: 1.3.6.1.4.1.50495.15.1.2.4.1
          annotations:
            description: |
-            {{ $value | humanize}}% or {{with query "sum(ceph_osd_up)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)"}}{{. | first | value }}{{ end }} OSDs are down (>=10%).
+            {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (≥ 10%).
  
              The following OSDs are down:
              {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
author	Benoît Knecht <bknecht@protonmail.ch>
	Thu, 30 Apr 2020 08:50:07 +0000 (10:50 +0200)
committer	Benoît Knecht <bknecht@protonmail.ch>
	Wed, 6 May 2020 16:49:26 +0000 (18:49 +0200)