monitoring: SNMP OID per every Prometheus alert rule

author Volker Theile <vtheile@suse.com>

Mon, 6 May 2019 14:26:37 +0000 (16:26 +0200)

committer Volker Theile <vtheile@suse.com>

Tue, 28 May 2019 07:59:50 +0000 (09:59 +0200)
author Volker Theile <vtheile@suse.com>
Mon, 6 May 2019 14:26:37 +0000 (16:26 +0200)
committer Volker Theile <vtheile@suse.com>
Tue, 28 May 2019 07:59:50 +0000 (09:59 +0200)
diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml
index 310be58fa710fbb0a64b1e3b3fa893e3dfaf3f1f..2f4560e0346bfa8578dbf973344110e24e904365 100644
--- a/monitoring/prometheus/alerts/ceph_default_alerts.yml
+++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml
@@ -7,14 +7,16 @@ groups:
          labels:
            severity: critical
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.2.1
          annotations:
-          description: Ceph in health_error state for more than 5m
+          description: Ceph in health_error state for more than 5m.
        - alert: health warn
          expr: ceph_health_status == 1
          for: 15m
          labels:
            severity: warning
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.2.2
          annotations:
            description: Ceph in health_warn for more than 15m.
    - name: mon
@@ -24,6 +26,7 @@ groups:
          labels:
            severity: critical
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.3.1
          annotations:
            description: Monitor count in quorum is low.
    - name: osd
@@ -33,6 +36,7 @@ groups:
          labels:
            severity: critical
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.4.1
          annotations:
            description: More than 10% of OSDs are down.
        - alert: OSD down
@@ -41,6 +45,7 @@ groups:
          labels:
            severity: warning
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.4.2
          annotations:
            description: One or more OSDs down for more than 15 minutes.
        - alert: OSDs near full
@@ -48,6 +53,7 @@ groups:
          labels:
            severity: critical
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.4.3
          annotations:
            description: OSD {{ $labels.ceph_daemon }} is dangerously full, over 80%.
        # alert on single OSDs flapping
@@ -56,6 +62,7 @@ groups:
          labels:
            severity: warning
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.4.4
          annotations:
            description: >
                OSD {{ $labels.ceph_daemon }} was marked down and back up at least once a
@@ -67,6 +74,7 @@ groups:
          labels:
            severity: warning
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.4.5
          annotations:
            description: >
                OSD {{ $labels.ceph_daemon }} deviates by more than 30% from
@@ -86,6 +94,7 @@ groups:
          labels:
            severity: critical
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.7.1
          annotations:
            description: One or more PGs are inactive for more than 5 minutes.
        - alert: pgs unclean
@@ -94,6 +103,7 @@ groups:
          labels:
            severity: warning
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.7.2
          annotations:
            description: One or more PGs are not clean for more than 15 minutes.
    - name: nodes
@@ -103,6 +113,7 @@ groups:
          labels:
            severity: critical
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.8.1
          annotations:
            description: Root volume (OSD and MON store) is dangerously full (< 5% free).
        # alert on nic packet errors and drops rates > 1 packet/s
@@ -111,6 +122,7 @@ groups:
          labels:
            severity: warning
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.8.2
          annotations:
            description: >
              Node {{ $labels.instance }} experiences packet drop > 1
@@ -120,6 +132,7 @@ groups:
          labels:
            severity: warning
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.8.3
          annotations:
            description: >
              Node {{ $labels.instance }} experiences packet errors > 1
@@ -130,6 +143,7 @@ groups:
          labels:
            severity: warning
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.8.4
          annotations:
            description: >
              Mountpoint {{ $labels.mountpoint }} will be full in less than 5 days
@@ -141,6 +155,7 @@ groups:
          labels:
            severity: critical
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.9.1
          annotations:
            description: Pool {{ $labels.name }} at 90% capacity or over.
        - alert: pool filling up
@@ -148,6 +163,7 @@ groups:
          labels:
            severity: warning
            type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.9.2
          annotations:
            description: >
              Pool {{ $labels.name }} will be full in less than 5 days
author Volker Theile <vtheile@suse.com>
Mon, 6 May 2019 14:26:37 +0000 (16:26 +0200)
committer Volker Theile <vtheile@suse.com>
Tue, 28 May 2019 07:59:50 +0000 (09:59 +0200)