git.apps.os.sepia.ceph.com Git - ceph-ansible.git/commitdiff
dashboard: Add and copy alerting rules
author Boris Ranto <branto@redhat.com>
Fri, 15 Feb 2019 19:27:15 +0000 (20:27 +0100)
committer Guillaume Abrioux <gabrioux@redhat.com>
Fri, 17 May 2019 14:05:58 +0000 (16:05 +0200)
This commit adds a list of alerting rules for ceph-dashboard, taken from
the old cephmetrics project. It also installs the rules file and points the
Prometheus configuration at it so that the rules are picked up by the
Prometheus server.

Signed-off-by: Boris Ranto <branto@redhat.com>
(cherry picked from commit 8f77caa932f80e03e9f978855d22e8b40d240933)

roles/ceph-prometheus/files/ceph_dashboard.yml [new file with mode: 0644]
roles/ceph-prometheus/tasks/main.yml
roles/ceph-prometheus/templates/prometheus.yml
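
As a quick sanity check for the rules added below, a minimal sketch of a
validation task (hypothetical, not part of this commit; it assumes promtool
from the Prometheus distribution is available on the monitoring host):

# Hypothetical validation task, not part of this commit: promtool parses the
# copied rule file and fails if any rule or alert expression is malformed.
- name: validate ceph dashboard alerting rules
  command: promtool check rules /etc/prometheus/alerting/ceph_dashboard.yml
  changed_when: false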

diff --git a/roles/ceph-prometheus/files/ceph_dashboard.yml b/roles/ceph-prometheus/files/ceph_dashboard.yml
new file mode 100644 (file)
index 0000000..aff1b25
--- /dev/null
@@ -0,0 +1,107 @@
+groups:
+- name: dashboard
+  rules:
+  - alert: Ceph Health Warning
+    expr: ceph_health_status == 1
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Ceph Health Warning"
+      description: "Overall Ceph Health"
+  - alert: Ceph Health Error
+    expr: ceph_health_status > 1
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Ceph Health Error"
+      description: "The Ceph cluster health is in an error state"
+  - alert: Disk(s) Near Full
+    expr: (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) * 100 > 85
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Disk(s) Near Full"
+      description: "This shows how many disks are at or above 85% full. Performance may degrade beyond this threshold on filestore (XFS) backed OSD's."
+  - alert: OSD(s) Down
+    expr: ceph_osd_up < 0.5
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "OSD(s) Down"
+      description: "This indicates that one or more OSDs is currently marked down in the cluster."
+  - alert: OSD Host(s) Down
+    expr: count by(instance) (ceph_disk_occupation * on(ceph_daemon) group_right(instance) ceph_osd_up == 0) - count by(instance) (ceph_disk_occupation) == 0
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "OSD Host(s) Down"
+      description: "This indicates that one or more OSD hosts is currently down in the cluster."
+  - alert: PG(s) Stuck
+    expr: max(ceph_osd_numpg) > scalar(ceph_pg_active)
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "PG(s) Stuck"
+      description: "This indicates there are pg's in a stuck state, manual intervention needed to resolve."
+  - alert: OSD Host Loss Check
+    expr: max(sum(ceph_osd_stat_bytes - ceph_osd_stat_bytes_used)) * 0.9 < scalar(max(sum by (instance) (ceph_osd_stat_bytes + on (ceph_daemon) group_left (instance) (ceph_disk_occupation*0))))
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "OSD Host Loss Check"
+      description: "This indicates that the cluster @ 90% full is not enough to support the loss of the largest OSD host."
+  - alert: Slow OSD Responses
+    expr: ((irate(node_disk_read_time_seconds_total[5m]) / clamp_min(irate(node_disk_reads_completed_total[5m]), 1) + irate(node_disk_write_time_seconds_total[5m]) / clamp_min(irate(node_disk_writes_completed_total[5m]), 1)) and on (instance, device) ceph_disk_occupation) > 1
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Slow OSD Responses"
+      description: "This indicates that some OSD Latencies are above 1s."
+  - alert: Network Errors
+    expr: sum by (instance, device) (irate(node_network_receive_drop_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m]) + irate(node_network_receive_errs_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m]) + irate(node_network_transmit_drop_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m]) + irate(node_network_transmit_errs_total{device=~"(eth|en|bond|ib|mlx|p).*"}[5m])) > 10
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Network Errors"
+      description: "This indicates that more than 10 dropped/error packets are seen in a 5m interval"
+  - alert: Pool Capacity Low
+    expr: (ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail) * 100 + on (pool_id) group_left (name) (ceph_pool_metadata*0)) > 85
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Pool Capacity Low"
+      description: "This indicates a low capacity in a pool."
+  - alert: MON(s) Down
+    expr: ceph_mon_quorum_status != 1
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "MON(s) down"
+      description: "This indicates that one or more MON(s) is down."
+  - alert: Cluster Capacity Low
+    expr: sum(ceph_osd_stat_bytes_used) / sum(ceph_osd_stat_bytes) > 0.85
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Cluster Capacity Low"
+      description: "This indicates raw used space crosses the 85% capacity threshold of the ceph cluster."
+  - alert: OSD(s) with High PG Count
+    expr: ceph_osd_numpg > 275
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "OSD(s) with High PG Count"
+      description: "This indicates there are some OSDs with high PG count (275+)."
diff --git a/roles/ceph-prometheus/tasks/main.yml b/roles/ceph-prometheus/tasks/main.yml
index aaa03099c671dfd5aa267214c226a833334ef395..39f15008e7b8a3842feae70251feed3ef202587f 100644 (file)
     owner: "{{ prometheus_user_id }}"
   notify: service handler
 
+- name: make sure the alerting rules directory exists
+  file:
+    path: "/etc/prometheus/alerting/"
+    state: directory
+    recurse: yes
+
+- name: copy alerting rules
+  copy:
+    src: "ceph_dashboard.yml"
+    dest: "/etc/prometheus/alerting/ceph_dashboard.yml"
+    owner: root
+    group: root
+    mode: 0644
+
 - name: create alertmanager directories
   file:
     path: "{{ item }}"
diff --git a/roles/ceph-prometheus/templates/prometheus.yml b/roles/ceph-prometheus/templates/prometheus.yml
index 860eb5e6c1bc37718aa80dfc0aa026baed707687..70262e4edfff45acacff34855b7e4d865f7c7f3c 100644 (file)
@@ -3,7 +3,7 @@ global:
   evaluation_interval: 15s
 
 rule_files:
-  - '/etc/prometheus/alerts/*'
+  - '/etc/prometheus/alerting/*'
 
 scrape_configs:
   - job_name: 'prometheus'
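
The prometheus.yml template task in roles/ceph-prometheus/tasks/main.yml
notifies a "service handler" (see the context line above the new tasks). A
hedged variant of the new copy task that does the same, so the new rules take
effect without a manual restart (hypothetical, not part of this commit):

# Hypothetical variant of the copy task above, not part of this commit:
# notifying the role's existing "service handler" (the same one the
# prometheus.yml template task uses) restarts/reloads Prometheus whenever
# the rule file changes.
- name: copy alerting rules
  copy:
    src: ceph_dashboard.yml
    dest: /etc/prometheus/alerting/ceph_dashboard.yml
    owner: root
    group: root
    mode: 0644
  notify: service handler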