monitoring: add a few prometheus alerts
author    Jan Fajerski <jfajerski@suse.com>
          Mon, 15 Apr 2019 13:35:09 +0000 (15:35 +0200)
committer Nathan Cutler <ncutler@suse.com>
          Mon, 20 May 2019 10:29:50 +0000 (12:29 +0200)
The alerts are taken from
https://github.com/SUSE/DeepSea/blob/SES5/srv/salt/ceph/monitoring/prometheus/files/ses_default_alerts.yml
and updated for the mgr prometheus module and node_exporter >= 0.15.

Signed-off-by: Jan Fajerski <jfajerski@suse.com>
(cherry picked from commit c0e58bd8aeab698656d91846b704fa8335fc0115)
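
For context, the rules added below assume metrics scraped from the ceph-mgr prometheus module and from node_exporter. A minimal scrape configuration might look like the sketch below; the hostnames are placeholders and the ports (9283 for the mgr module, 9100 for node_exporter) are the usual defaults, not something this commit configures.

```yaml
# prometheus.yml (excerpt) -- illustrative only; hostnames are placeholders
scrape_configs:
  - job_name: 'ceph'      # ceph-mgr prometheus module, default port 9283
    static_configs:
      - targets: ['ceph-mgr-host:9283']
  - job_name: 'node'      # node_exporter >= 0.15, default port 9100
    static_configs:
      - targets: ['ceph-node-1:9100', 'ceph-node-2:9100']
```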

monitoring/prometheus/README.md [new file with mode: 0644]
monitoring/prometheus/alerts/ceph_default_alerts.yml [new file with mode: 0644]

diff --git a/monitoring/prometheus/README.md b/monitoring/prometheus/README.md
new file mode 100644 (file)
index 0000000..fde63a3
--- /dev/null
@@ -0,0 +1,7 @@
+## Prometheus related bits
+
+### Alerts
+In monitoring/prometheus/alerts you'll find a set of Prometheus alert rules that
+should provide reasonable default alerting for a Ceph cluster. Place this file
+wherever your Prometheus configuration expects rule files, i.e. wherever the
+`rule_files` configuration option points.
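
As a sketch of the placement the README describes, the alerts file can be referenced from the Prometheus server configuration via `rule_files`; the path below is an assumption, adjust it to wherever the file is copied.

```yaml
# prometheus.yml (excerpt) -- example path, adapt to your layout
rule_files:
  - /etc/prometheus/alerts/ceph_default_alerts.yml
```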
diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml
new file mode 100644 (file)
index 0000000..310be58
--- /dev/null
@@ -0,0 +1,154 @@
+groups:
+  - name: cluster health
+    rules:
+      - alert: health error
+        expr: ceph_health_status == 2
+        for: 5m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          description: Ceph in health_error state for more than 5m
+      - alert: health warn
+        expr: ceph_health_status == 1
+        for: 15m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: Ceph in health_warn for more than 15m.
+  - name: mon
+    rules:
+      - alert: low monitor quorum count
+        expr: sum(ceph_mon_quorum_status) < 3
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          description: Monitor count in quorum is low.
+  - name: osd
+    rules:
+      - alert: 10% OSDs down
+        expr: sum(ceph_osd_up) / count(ceph_osd_in) <= 0.9
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          description: 10% or more of OSDs are down.
+      - alert: OSD down
+        expr: count(ceph_osd_up == 0) > 0
+        for: 15m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: One or more OSDs down for more than 15 minutes.
+      - alert: OSDs near full
+        expr: ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1) > 0.8
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          description: OSD {{ $labels.ceph_daemon }} is dangerously full, over 80%.
+      # alert on single OSDs flapping
+      - alert: flap osd
+        expr: rate(ceph_osd_up[5m])*60 > 1
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: >
+              OSD {{ $labels.ceph_daemon }} was marked down and back up at least once a
+              minute for 5 minutes.
+      # alert on high deviation from average PG count
+      - alert: high pg count deviation
+        expr: abs(((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)) > 0.35
+        for: 5m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: >
+              OSD {{ $labels.ceph_daemon }} deviates by more than 35% from
+              average PG count.
+      # alert on high commit latency...but how high is too high
+  - name: mds
+    rules:
+    # no mds metrics are exported yet
+  - name: mgr
+    rules:
+    # no mgr metrics are exported yet
+  - name: pgs
+    rules:
+      - alert: pgs inactive
+        expr: ceph_pg_total - ceph_pg_active > 0
+        for: 5m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          description: One or more PGs are inactive for more than 5 minutes.
+      - alert: pgs unclean
+        expr: ceph_pg_total - ceph_pg_clean > 0
+        for: 15m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: One or more PGs are not clean for more than 15 minutes.
+  - name: nodes
+    rules:
+      - alert: root volume full
+        expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} < 0.05
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          description: Root volume (OSD and MON store) is dangerously full (< 5% free).
+      # alert on nic packet errors and drops rates > 1 packet/s
+      - alert: network packets dropped
+        expr: irate(node_network_receive_drop_total{device!="lo"}[5m]) + irate(node_network_transmit_drop_total{device!="lo"}[5m]) > 1
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: >
+            Node {{ $labels.instance }} experiences packet drop > 1
+            packet/s on interface {{ $labels.device }}.
+      - alert: network packet errors
+        expr: irate(node_network_receive_errs_total{device!="lo"}[5m]) + irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: >
+            Node {{ $labels.instance }} experiences packet errors > 1
+            packet/s on interface {{ $labels.device }}.
+      # predict fs fillup times
+      - alert: storage filling
+        expr: predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) <= 0
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: >
+            Mountpoint {{ $labels.mountpoint }} will be full in less than 5 days
+            assuming the average fillup rate of the past 48 hours.
+  - name: pools
+    rules:
+      - alert: pool full
+        expr: ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail) * on(pool_id) group_right ceph_pool_metadata > 0.9
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          description: Pool {{ $labels.name }} at 90% capacity or over.
+      - alert: pool filling up
+        expr: (predict_linear(ceph_pool_max_avail[2d], 3600 * 24 * 5) <= 0) * on(pool_id) group_right ceph_pool_metadata
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: >
+            Pool {{ $labels.name }} will be full in less than 5 days
+            assuming the average fillup rate of the past 48 hours.
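
The `severity` and `type` labels set throughout these rules can be used for routing in Alertmanager. The snippet below only illustrates that idea (the receiver names are made up, not part of this commit); the rules file itself can be syntax-checked with `promtool check rules ceph_default_alerts.yml`.

```yaml
# alertmanager.yml (excerpt) -- receivers and routing are illustrative only
route:
  receiver: ceph-default
  routes:
    - match:
        type: ceph_default
        severity: critical
      receiver: ceph-oncall
receivers:
  - name: ceph-default
  - name: ceph-oncall
```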