From 8e6838c740180d4a0e9dfa59b99f832d88694405 Mon Sep 17 00:00:00 2001 From: Volker Theile Date: Mon, 6 May 2019 16:26:37 +0200 Subject: [PATCH] monitoring: SNMP OID for every Prometheus alert rule Use the Ceph enterprise OID 50495 (https://www.iana.org/assignments/enterprise-numbers/enterprise-numbers) and create OIDs for every Prometheus alert rule according to the schema at https://github.com/SUSE/prometheus-webhook-snmp/blob/master/README.md. Example OID: 1.3.6.1.4.1.50495.15.1.2.2.1 All alert rule OIDs are located below the object identifier 15 (15 for p which is the first character of prometheus). Check out the MIB at https://github.com/SUSE/prometheus-webhook-snmp/blob/master/PROMETHEUS-ALERT-CEPH-MIB.txt for more details. Signed-off-by: Volker Theile --- .../prometheus/alerts/ceph_default_alerts.yml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml index 310be58fa71..2f4560e0346 100644 --- a/monitoring/prometheus/alerts/ceph_default_alerts.yml +++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml @@ -7,14 +7,16 @@ groups: labels: severity: critical type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.2.1 annotations: - description: Ceph in health_error state for more than 5m + description: Ceph in health_error state for more than 5m. - alert: health warn expr: ceph_health_status == 1 for: 15m labels: severity: warning type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.2.2 annotations: description: Ceph in health_warn for more than 15m. - name: mon @@ -24,6 +26,7 @@ groups: labels: severity: critical type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.3.1 annotations: description: Monitor count in quorum is low. - name: osd @@ -33,6 +36,7 @@ groups: labels: severity: critical type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.4.1 annotations: description: More than 10% of OSDs are down. 
- alert: OSD down @@ -41,6 +45,7 @@ groups: labels: severity: warning type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.4.2 annotations: description: One or more OSDs down for more than 15 minutes. - alert: OSDs near full @@ -48,6 +53,7 @@ groups: labels: severity: critical type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.4.3 annotations: description: OSD {{ $labels.ceph_daemon }} is dangerously full, over 80%. # alert on single OSDs flapping @@ -56,6 +62,7 @@ groups: labels: severity: warning type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.4.4 annotations: description: > OSD {{ $labels.ceph_daemon }} was marked down and back up at least once a @@ -67,6 +74,7 @@ groups: labels: severity: warning type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.4.5 annotations: description: > OSD {{ $labels.ceph_daemon }} deviates by more than 30% from @@ -86,6 +94,7 @@ groups: labels: severity: critical type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.7.1 annotations: description: One or more PGs are inactive for more than 5 minutes. - alert: pgs unclean @@ -94,6 +103,7 @@ groups: labels: severity: warning type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.7.2 annotations: description: One or more PGs are not clean for more than 15 minutes. - name: nodes @@ -103,6 +113,7 @@ groups: labels: severity: critical type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.8.1 annotations: description: Root volume (OSD and MON store) is dangerously full (< 5% free). 
# alert on nic packet errors and drops rates > 1 packet/s @@ -111,6 +122,7 @@ groups: labels: severity: warning type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.8.2 annotations: description: > Node {{ $labels.instance }} experiences packet drop > 1 @@ -120,6 +132,7 @@ groups: labels: severity: warning type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.8.3 annotations: description: > Node {{ $labels.instance }} experiences packet errors > 1 @@ -130,6 +143,7 @@ groups: labels: severity: warning type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.8.4 annotations: description: > Mountpoint {{ $labels.mountpoint }} will be full in less than 5 days @@ -141,6 +155,7 @@ groups: labels: severity: critical type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.9.1 annotations: description: Pool {{ $labels.name }} at 90% capacity or over. - alert: pool filling up @@ -148,6 +163,7 @@ groups: labels: severity: warning type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.9.2 annotations: description: > Pool {{ $labels.name }} will be full in less than 5 days -- 2.39.5