git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/prometheus: Update rule format and enhance SNMP support 43783/head
author Paul Cuzner <pcuzner@redhat.com>
Wed, 3 Nov 2021 02:24:20 +0000 (15:24 +1300)
committer Paul Cuzner <pcuzner@redhat.com>
Thu, 4 Nov 2021 22:24:25 +0000 (11:24 +1300)
Rules now adhere to the format defined by Prometheus.io.
This changes alert naming, and each alert now includes a
summary annotation to provide a quick one-liner.

In addition to the reformatting, missing alerts for MDS and
cephadm have been added, along with corresponding tests.

The MIB has also been refactored so that it now passes standard
lint tests, and a README is included to help developers
understand the OID schema.

Fixes: https://tracker.ceph.com/issues/53111
Signed-off-by: Paul Cuzner <pcuzner@redhat.com>
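
For reference, every reworked rule now follows the same shape: a CamelCase
alert name, an optional oid label for the SNMP trap, and a one-line summary
annotation alongside the existing description. A condensed sketch, taken
from the CephHealthError rule in the diff below:

      - alert: CephHealthError
        expr: ceph_health_status == 2
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.2.1
        annotations:
          summary: Cluster is in an ERROR state
          description: >
            Ceph in HEALTH_ERROR state for more than 5 minutes.
            Please check "ceph health detail" for more information.
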
monitoring/prometheus/alerts/ceph_default_alerts.yml
monitoring/prometheus/tests/test_alerts.yml
monitoring/prometheus/tests/validate_rules.py
monitoring/snmp/CEPH-MIB.txt [new file with mode: 0644]
monitoring/snmp/README.md
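
The oid labels attached to the updated rules share a common layout. The sketch
below is inferred from the values in this diff; the authoritative definitions
live in monitoring/snmp/CEPH-MIB.txt and the accompanying README:

    # 1.3.6.1.4.1.50495 . 1.2.1 . <group> . <alert>
    #   1.3.6.1.4.1.50495   enterprise prefix used for all Ceph traps
    #   1.2.1               fixed prefix shared by every alert in these rules
    #   <group>             rule group: 1=generic, 2=cluster health, 3=mon,
    #                       4=osd, 5=mds, 6=mgr, 7=pgs, 8=nodes, 9=pools,
    #                       10=rados, 11=cephadm, 12=prometheus
    #   <alert>             index of the alert within its group
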

index 4b9ea51ebe1d86fa83af5e13aac4b2d44b6edc65..d9e6e35637f92deb1b0da841a05b0438b71bb44b 100644 (file)
@@ -1,41 +1,43 @@
 groups:
   - name: cluster health
     rules:
-      - alert: health error
+      - alert: CephHealthError
         expr: ceph_health_status == 2
         for: 5m
         labels:
           severity: critical
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.2.1
+          oid: 1.3.6.1.4.1.50495.1.2.1.2.1
         annotations:
+          summary: Cluster is in an ERROR state
           description: >
             Ceph in HEALTH_ERROR state for more than 5 minutes.
             Please check "ceph health detail" for more information.
 
-      - alert: health warn
+      - alert: CephHealthWarning
         expr: ceph_health_status == 1
         for: 15m
         labels:
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.2.2
         annotations:
+          summary: Cluster is in a WARNING state
           description: >
             Ceph has been in HEALTH_WARN for more than 15 minutes.
             Please check "ceph health detail" for more information.
 
   - name: mon
     rules:
-      - alert: Monitor down, quorum is at risk
+      - alert: CephMonDownQuorumAtRisk
         expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
         for: 30s
         labels:
           severity: critical
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.3.1
+          oid: 1.3.6.1.4.1.50495.1.2.1.3.1
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
+          summary: Monitor quorum is at risk
           description: |
             {{ $min := query "floor(count(ceph_mon_metadata) / 2) +1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active
             Without quorum the cluster will become inoperable, affecting all connected clients and services.
@@ -44,7 +46,7 @@ groups:
             {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
               - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
             {{- end }}
-      - alert: Monitor down
+      - alert: CephMonDown
         expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
         for: 30s
         labels:
@@ -52,6 +54,7 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
+          summary: One or more ceph monitors are down
           description: |
             {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down.
             Quorum is still intact, but the loss of further monitors will make your cluster inoperable.
@@ -60,14 +63,16 @@ groups:
             {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
               - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
             {{- end }}
-      - alert: Ceph mon disk space critically low
+      - alert: CephMonDiskspaceCritical
         expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
         for: 1m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.3.2
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
+          summary: Disk space on at least one monitor is critically low
           description: |
             The free space available to a monitor's store is critically low (<5% by default).
             You should increase the space available to the monitor(s). The
@@ -76,7 +81,7 @@ groups:
               - {{ .Labels.hostname }}
             {{- end }}
 
-      - alert: Ceph mon disk space running low
+      - alert: CephMonDiskspaceLow
         expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
         for: 5m
         labels:
@@ -84,6 +89,7 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
+          summary: Disk space on at least one monitor is approaching full
           description: |
             The space available to a monitor's store is approaching full (>70% is the default).
             You should increase the space available to the monitor store. The
@@ -92,7 +98,7 @@ groups:
               - {{ .Labels.hostname }}
             {{- end }}
 
-      - alert: Clock skew detected across Ceph Monitor daemons
+      - alert: CephMonClockSkew
         expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
         for: 1m
         labels:
@@ -100,6 +106,7 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
+          summary: Clock skew across the Monitor hosts detected
           description: |
             The ceph monitors rely on a consistent time reference to maintain
             quorum and cluster consistency. This event indicates that at least
@@ -110,41 +117,45 @@ groups:
 
   - name: osd
     rules:
-      - alert: 10% OSDs down
+      - alert: CephOSDDownHigh
         expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
         labels:
           severity: critical
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.4.1
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.1
         annotations:
+          summary: More than 10% of OSDs are down
           description: |
-            {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down ( 10%).
+            {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%).
 
             The following OSDs are down:
             {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
               - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
             {{- end }}
-      - alert: OSD Host is down
+      - alert: CephOSDHostDown
         expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
         for: 5m
         labels:
           severity: warning
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.8
         annotations:
+          summary: An OSD host is offline
           description: |
             The following OSDs are down:
             {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
             - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }}
             {{- end }}
-      - alert: OSD down
+      - alert: CephOSDDown
         expr: ceph_health_detail{name="OSD_DOWN"} == 1
         for: 5m
         labels:
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.4.2
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.2
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
+          summary: An OSD has been marked down/unavailable
           description: |
             {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins.
 
@@ -153,34 +164,37 @@ groups:
               - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
               {{- end }}
 
-      - alert: OSDs near full
+      - alert: CephOSDNearFull
         expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
         for: 5m
         labels:
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.4.3
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.3
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
+          summary: OSD(s) running low on free space (NEARFULL)
           description: |
             One or more OSDs have reached their NEARFULL threshold
 
             Use 'ceph health detail' to identify which OSDs have reached this threshold.
             To resolve, either add capacity to the cluster, or delete unwanted data
-      - alert: OSD Full
+      - alert: CephOSDFull
         expr: ceph_health_detail{name="OSD_FULL"} > 0
         for: 1m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.6
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
+          summary: OSD(s) is full, writes blocked
           description: |
             An OSD has reached it's full threshold. Writes from all pools that share the
             affected OSD will be blocked.
 
             To resolve, either add capacity to the cluster, or delete unwanted data
-      - alert: OSD unable to perform rebalance
+      - alert: CephOSDBackfillFull
         expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0
         for: 1m
         labels:
@@ -188,12 +202,13 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
+          summary: OSD(s) too full for backfill operations
           description: |
             An OSD has reached it's BACKFILL FULL threshold. This will prevent rebalance operations
             completing for some pools. Check the current capacity utilisation with 'ceph df'
 
             To resolve, either add capacity to the cluster, or delete unwanted data
-      - alert: OSD too many read repairs
+      - alert: CephOSDTooManyRepairs
         expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1
         for: 30s
         labels:
@@ -201,30 +216,33 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
+          summary: OSD has hit a high number of read errors
           description: |
             Reads from an OSD have used a secondary PG to return data to the client, indicating
             a potential failing disk.
-      - alert: OSD hearbeats running slow (frontend)
+      - alert: CephOSDTimeoutsPublicNetwork
         expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1
         for: 1m
         labels:
           severity: warning
           type: ceph_default
         annotations:
+          summary: Network issues delaying OSD heartbeats (public network)
           description: |
             OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network
             for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
-      - alert: OSD hearbeats running slow (backend)
+      - alert: CephOSDTimeoutsClusterNetwork
         expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1
         for: 1m
         labels:
           severity: warning
           type: ceph_default
         annotations:
+          summary: Network issues delaying OSD heartbeats (cluster network)
           description: |
             OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network
             for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
-      - alert: OSD disk size mismatch
+      - alert: CephOSDInternalDiskSizeMismatch
         expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1
         for: 1m
         labels:
@@ -232,10 +250,11 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
+          summary: OSD size inconsistency error
           description: |
             One or more OSDs have an internal inconsistency between the size of the physical device and it's metadata.
             This could lead to the OSD(s) crashing in future. You should redeploy the effected OSDs.
-      - alert: Device failure predicted
+      - alert: CephDeviceFailurePredicted
         expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
         for: 1m
         labels:
@@ -243,6 +262,7 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
+          summary: Device(s) have been predicted to fail soon
           description: |
             The device health module has determined that one or more devices will fail
             soon. To review the device states use 'ceph device ls'. To show a specific
@@ -250,20 +270,22 @@ groups:
 
             Mark the OSD as out (so data may migrate to other OSDs in the cluster). Once
             the osd is empty remove and replace the OSD.
-      - alert: Too many devices predicted to fail
+      - alert: CephDeviceFailurePredictionTooHigh
         expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
         for: 1m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.7
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
+          summary: Too many devices have been predicted to fail, unable to resolve
           description: |
             The device health module has determined that the number of devices predicted to
             fail can not be remediated automatically, since it would take too many osd's out of
             the cluster, impacting performance and potentially availabililty. You should add new
             OSDs to the cluster to allow data to be relocated to avoid the data integrity issues.
-      - alert: Device failure predicted, but automatic drain is incomplete
+      - alert: CephDeviceFailureRelocationIncomplete
         expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
         for: 1m
         labels:
@@ -271,6 +293,7 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
+          summary: A device failure is predicted, but unable to relocate data
           description: |
             The device health module has determined that one or more devices will fail
             soon, but the normal process of relocating the data on the device to other
@@ -280,7 +303,7 @@ groups:
             more disks to the cluster to allow the data from the failing device to
             successfully migrate.
 
-      - alert: Flapping OSD
+      - alert: CephOSDFlapping
         expr: |
           (
             rate(ceph_osd_up[5m])
@@ -289,9 +312,10 @@ groups:
         labels:
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.4.4
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.4
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
+          summary: Network issues are causing OSDs to flap (mark each other out)
           description: >
             OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
             marked down and back up at {{ $value | humanize }} times once a
@@ -299,7 +323,7 @@ groups:
             packet drop, disruption) on the clusters "cluster network". Check the
             network environment on the listed host(s).
 
-      - alert: OSD Read errors
+      - alert: CephOSDReadErrors
         expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
         for: 30s
         labels:
@@ -307,11 +331,12 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
+          summary: Device read errors detected
           description: >
             An OSD has encountered read errors, but the OSD has recovered by retrying
             the reads. This may indicate an issue with the Hardware or Kernel.
       # alert on high deviation from average PG count
-      - alert: high pg count deviation
+      - alert: CephPGImbalance
         expr: |
           abs(
             (
@@ -322,8 +347,9 @@ groups:
         labels:
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.4.5
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.5
         annotations:
+          summary: PG allocations are not balanced across devices
           description: >
             OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
             by more than 30% from average PG count.
@@ -331,28 +357,99 @@ groups:
 
   - name: mds
     rules:
-      - alert: Ceph Filesystem damage detected
+      - alert: CephFilesystemDamaged
         expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
         for: 1m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.5.1
         annotations:
           documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
+          summary: Ceph filesystem is damaged.
           description: >
             The filesystems metadata has been corrupted. Data access
             may be blocked.
 
             Either analyse the output from the mds daemon admin socket, or
             escalate to support
-      - alert: Ceph Filesystem switched to READ ONLY
+      - alert: CephFilesystemOffline
+        expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.5.3
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
+          summary: Ceph filesystem is offline
+          description: >
+            All MDS ranks are unavailable. The ceph daemons providing the metadata
+            for the Ceph filesystem are all down, rendering the filesystem offline.
+      - alert: CephFilesystemDegraded
+        expr: ceph_health_detail{name="FS_DEGRADED"} > 0
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.5.4
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
+          summary: Ceph filesystem is degraded
+          description: >
+            One or more metadata daemons (MDS ranks) are failed or in a
+            damaged state. At best the filesystem is partially available;
+            at worst it is completely unusable.
+      - alert: CephFilesystemMDSRanksLow
+        expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
+          summary: Ceph MDS daemon count is lower than configured
+          description: >
+            The filesystem's "max_mds" setting defines the number of MDS ranks in
+            the filesystem. The current number of active MDS daemons is less than
+            this setting.
+      - alert: CephFilesystemInsufficientStandby
+        expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
+          summary: Ceph filesystem standby daemons too low
+          description: >
+            The number of standby daemons available is less than the minimum
+            determined by standby_count_wanted. Adjust the standby count
+            or increase the number of MDS daemons within the filesystem.
+      - alert: CephFilesystemFailureNoStandby
+        expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.5.5
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
+          summary: Ceph MDS daemon failed, no further standby available
+          description: >
+            An MDS daemon has failed, leaving only one active rank without
+            further standby. Investigate the cause of the failure or add a
+            standby daemon
+      - alert: CephFilesystemReadOnly
         expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
         for: 1m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.5.2
         annotations:
           documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
+          summary: Ceph filesystem in read only mode, due to write error(s)
           description: >
             The filesystem has switched to READ ONLY due to an unexpected
             write error, when writing to the metadata pool
@@ -362,25 +459,29 @@ groups:
 
   - name: mgr
     rules:
-      - alert: mgr module failure
+      - alert: CephMgrModuleCrash
         expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
         for: 5m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.6.1
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
+          summary: A mgr module has recently crashed
           description: >
             One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A
             crashed module may impact functionality within the cluster. Use the 'ceph crash' commands to
             investigate which module has failed, and archive it to acknowledge the failure.
-      - alert: mgr prometheus module is not active
+      - alert: CephMgrPrometheusModuleInactive
         expr: up{job="ceph"} == 0
         for: 1m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.6.2
         annotations:
+          summary: Ceph's mgr/prometheus module is not available
           description: >
             The mgr/prometheus module at {{ $labels.instance }} is unreachable. This
             could mean that the module has been disabled or the mgr itself is down.
@@ -393,37 +494,41 @@ groups:
 
   - name: pgs
     rules:
-      - alert: pgs inactive
+      - alert: CephPGsInactive
         expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
         for: 5m
         labels:
           severity: critical
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.7.1
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.1
         annotations:
+          summary: One or more Placement Groups are inactive
           description: >
             {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
             Inactive placement groups aren't able to serve read/write
             requests.
-      - alert: pgs unclean
+      - alert: CephPGsUnclean
         expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
         for: 15m
         labels:
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.7.2
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.2
         annotations:
+          summary: One or more placement groups are marked unclean
           description: >
             {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
             Unclean PGs haven't been able to completely recover from a previous failure.
-      - alert: Placement Group (PG) damaged
+      - alert: CephPGsDamaged
         expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
         for: 5m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.4
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
+          summary: Placement group damaged, manual intervention needed
           description: >
             During data consistency checks (scrub), at least one PG has been flagged as being
             damaged or inconsistent.
@@ -431,41 +536,47 @@ groups:
             Check to see which PG is affected, and attempt a manual repair if neccessary. To list
             problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use
             the 'ceph pg repair <pg_num>' command.
-      - alert: Recovery at risk, cluster too full
+      - alert: CephPGRecoveryAtRisk
         expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1
         for: 1m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.5
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
+          summary: OSDs are too full for automatic recovery
           description: >
             Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their
             'full' threshold. Add more capacity to the cluster, or delete unwanted data.
-      - alert: I/O blocked to some data
+      - alert: CephPGUnavilableBlockingIO
         # PG_AVAILABILITY, but an OSD is not in a DOWN state
         expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1
         for: 1m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.3
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
+          summary: Placement group is unavailable, blocking some I/O
           description: >
             Data availability is reduced impacting the clusters abilty to service I/O to some data. One or
             more placement groups (PGs) are in a state that blocks IO.
-      - alert: Cluster too full, automatic data recovery impaired
+      - alert: CephPGBackfillAtRisk
         expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1
         for: 1m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.6
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
+          summary: Backfill operations are blocked due to lack of free space
           description: >
             Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs
             have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data.
-      - alert: Placement Group(s) have not been scrubbed
+      - alert: CephPGNotScrubbed
         expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
         for: 5m
         labels:
@@ -473,6 +584,7 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
+          summary: Placement group(s) have not been scrubbed
           description: |
             One or more PGs have not been scrubbed recently. The scrub process is a data integrity
             feature, protectng against bit-rot. It checks that objects and their metadata (size and
@@ -481,7 +593,23 @@ groups:
             scrub window.
 
             You can manually initiate a scrub with: ceph pg scrub <pgid>
-      - alert: Placement Group(s) have not been 'DEEP' scrubbed
+      - alert: CephPGsHighPerOSD
+        expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
+          summary: Placement groups per OSD is too high
+          description: |
+            The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).
+
+            Check that the pg_autoscaler hasn't been disabled for any of the pools, with 'ceph osd pool autoscale-status'
+            and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide
+            the autoscaler based on the expected relative size of the pool
+            (i.e. 'ceph osd pool set cephfs.cephfs.meta target_size_ratio .1')
+      - alert: CephPGNotDeepScrubbed
         expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
         for: 5m
         labels:
@@ -489,6 +617,7 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
+          summary: Placement group(s) have not been deep scrubbed
           description: |
             One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity
             feature, protectng against bit-rot. It compares the contents of objects and their
@@ -500,19 +629,20 @@ groups:
 
   - name: nodes
     rules:
-      - alert: root volume full
+      - alert: CephNodeRootFilesystemFull
         expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
         for: 5m
         labels:
           severity: critical
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.8.1
+          oid: 1.3.6.1.4.1.50495.1.2.1.8.1
         annotations:
+          summary: Root filesystem is dangerously full
           description: >
             Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
 
       # alert on nic packet errors and drops rates > 1% packets/s
-      - alert: network packets dropped
+      - alert: CephNodeNetworkPacketDrops
         expr: |
           (
             increase(node_network_receive_drop_total{device!="lo"}[1m]) +
@@ -527,13 +657,14 @@ groups:
         labels:
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.8.2
+          oid: 1.3.6.1.4.1.50495.1.2.1.8.2
         annotations:
+          summary: One or more NICs are seeing packet drops
           description: >
             Node {{ $labels.instance }} experiences packet drop > 0.01% or >
             10 packets/s on interface {{ $labels.device }}.
 
-      - alert: network packet errors
+      - alert: CephNodeNetworkPacketErrors
         expr: |
           (
             increase(node_network_receive_errs_total{device!="lo"}[1m]) +
@@ -548,102 +679,96 @@ groups:
         labels:
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.8.3
+          oid: 1.3.6.1.4.1.50495.1.2.1.8.3
         annotations:
+          summary: One or more NICs are seeing packet errors
           description: >
             Node {{ $labels.instance }} experiences packet errors > 0.01% or
             > 10 packets/s on interface {{ $labels.device }}.
 
       # Restrict to device names beginning with '/' to skip false alarms from
       # tmpfs, overlay type filesystems
-      - alert: storage filling up
+      - alert: CephNodeDiskspaceWarning
         expr: |
           predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
           on(instance) group_left(nodename) node_uname_info < 0
         labels:
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.8.4
+          oid: 1.3.6.1.4.1.50495.1.2.1.8.4
         annotations:
+          summary: Host filesystem free space is getting low
           description: >
             Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
             will be full in less than 5 days assuming the average fill-up
             rate of the past 48 hours.
 
-      - alert: MTU Mismatch
+      - alert: CephNodeInconsistentMTU
         expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
         labels:
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.8.5
         annotations:
+          summary: MTU settings across Ceph hosts are inconsistent
           description: >
             Node {{ $labels.instance }} has a different MTU size ({{ $value }})
             than the median value on device {{ $labels.device }}.
 
   - name: pools
     rules:
-      - alert: pool full
+      - alert: CephPoolGrowthWarning
         expr: |
-          ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
-          * on(pool_id) group_right ceph_pool_metadata * 100 > 90
-        labels:
-          severity: critical
-          type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.9.1
-        annotations:
-          description: Pool {{ $labels.name }} at {{ $value | humanize }}% capacity.
-
-      - alert: pool filling up (growth forecast)
-        expr: |
-          (
-            predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5)
-            >= ceph_pool_stored + ceph_pool_max_avail
-          ) * on(pool_id) group_left(name) ceph_pool_metadata
+          (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
+              group_right ceph_pool_metadata) >= 95
         labels:
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.9.2
+          oid: 1.3.6.1.4.1.50495.1.2.1.9.2
         annotations:
+          summary: Pool growth rate may soon exceed its capacity
           description: >
-            Pool {{ $labels.name }} will be full in less than 5 days
+            Pool '{{ $labels.name }}' will be full in less than 5 days
             assuming the average fill-up rate of the past 48 hours.
-
-      - alert: Ceph pool is too full for recovery/rebalance
+      - alert: CephPoolBackfillFull
         expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0
         labels:
           severity: warning
           type: ceph_default
         annotations:
+          summary: Free space in a pool is too low for recovery/rebalance
           description: >
             A pool is approaching it's near full threshold, which will
             prevent rebalance operations from completing. You should
             consider adding more capacity to the pool.
 
-      - alert: Ceph pool is full - writes blocked
+      - alert: CephPoolFull
         expr: ceph_health_detail{name="POOL_FULL"} > 0
         for: 1m
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.9.1
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
+          summary: Pool is full - writes are blocked
           description: |
             A pool has reached it's MAX quota, or the OSDs supporting the pool
             have reached their FULL threshold. Until this is resolved, writes to
             the pool will be blocked.
-
-            Determine the affected pool with 'ceph df detail', for example looking
-            at QUOTA BYTES and STORED. Either increase the pools quota, or add
-            capacity to the cluster first then increase it's quota
-            (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
-      - alert: Ceph pool is approaching full
+            Pool Breakdown (top 5)
+            {{- range query "topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))" }}
+              - {{ .Labels.name }} at {{ .Value }}%
+            {{- end }}
+            Either increase the pool's quota, or add capacity to the cluster first
+            then increase its quota (e.g. ceph osd pool set-quota <pool_name> max_bytes <bytes>)
+      - alert: CephPoolNearFull
         expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0
         for: 5m
         labels:
           severity: warning
           type: ceph_default
         annotations:
+          summary: One or more Ceph pools are getting full
           description: |
             A pool has exceeeded it warning (percent full) threshold, or the OSDs
             supporting the pool have reached their NEARFULL thresholds. Writes may
@@ -656,7 +781,7 @@ groups:
             (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
   - name: healthchecks
     rules:
-      - alert: Slow OSD Ops
+      - alert: CephSlowOps
         expr: ceph_healthcheck_slow_ops > 0
         for: 30s
         labels:
@@ -664,35 +789,40 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
+          summary: MON/OSD operations are slow to complete
           description: >
             {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)
 # cephadm alerts
   - name: cephadm
     rules:
-      - alert: Cluster upgrade has failed
+      - alert: CephadmUpgradeFailed
         expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
         for: 30s
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.11.2
         annotations:
+          summary: Ceph version upgrade has failed
           description: >
             The cephadm cluster upgrade process has failed. The cluster remains in
             an undetermined state.
 
             Please review the cephadm logs, to understand the nature of the issue
-      - alert: A daemon managed by cephadm is down
+      - alert: CephadmDaemonFailed
         expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
         for: 30s
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.11.1
         annotations:
+          summary: A ceph daemon managed by cephadm is down
           description: >
             A daemon managed by cephadm is no longer active. Determine, which
             daemon is down with 'ceph health detail'. you may start daemons with
             the 'ceph orch daemon start <daemon_id>'
-      - alert: cephadm management has been paused
+      - alert: CephadmPaused
         expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
         for: 1m
         labels:
@@ -700,21 +830,24 @@ groups:
           type: ceph_default
         annotations:
           documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
+          summary: Orchestration tasks via cephadm are PAUSED
           description: >
             Cluster management has been paused manually. This will prevent the
             orchestrator from service management and reconciliation. If this is
             not intentional, resume cephadm operations with 'ceph orch resume'
 
 # prometheus alerts
-  - name: prometheus
+  - name: PrometheusServer
     rules:
-      - alert: Scrape job is missing
+      - alert: PrometheusJobMissing
         expr: absent(up{job="ceph"})
         for: 30s
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.12.1
         annotations:
+          summary: The scrape job for Ceph is missing from Prometheus
           description: |
             The prometheus job that scrapes from Ceph is no longer defined, this
             will effectively mean you'll have no metrics or alerts for the cluster.
@@ -724,15 +857,34 @@ groups:
 # Object related events
   - name: rados
     rules:
-      - alert: Data not found/missing
+      - alert: CephObjectMissing
         expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
         for: 30s
         labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.10.1
         annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
+          summary: Object(s) have been marked UNFOUND
           description: |
             A version of a RADOS object can not be found, even though all OSDs are up. I/O
             requests for this object from clients will block (hang). Resolving this issue may
-            require the object to be rolled back to a prior version manually, and manually verified.
\ No newline at end of file
+            require the object to be rolled back to a prior version manually, and manually verified.
+# Generic
+  - name: generic
+    rules:
+      - alert: CephDaemonCrash
+        expr: ceph_health_detail{name="RECENT_CRASH"} == 1
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.1.2
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
+          summary: One or more Ceph daemons have crashed, and are pending acknowledgement
+          description: |
+            One or more daemons have crashed recently, and need to be acknowledged. This notification
+            ensures that software crashes don't go unseen. To acknowledge a crash, use the
+            'ceph crash archive <id>' command.
\ No newline at end of file
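
The test changes below use the promtool rule unit test layout (input_series,
optional promql_expr_test, alert_rule_test). A minimal sketch of one of the
renamed cases, condensed from the CephHealthError test in this diff; the
rule_files path is an assumption based on the repository layout:

    rule_files:
      - ../alerts/ceph_default_alerts.yml
    tests:
      - interval: 1m
        input_series:
          - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
            values: '2+0x10'
        alert_rule_test:
          - eval_time: 1m
            alertname: CephHealthError   # nothing fires yet, "for: 5m" not satisfied
          - eval_time: 6m
            alertname: CephHealthError
            exp_alerts:
              - exp_labels:
                  instance: ceph:9283
                  job: ceph
                  oid: 1.3.6.1.4.1.50495.1.2.1.2.1
                  type: ceph_default
                  severity: critical
                exp_annotations:
                  summary: Cluster is in an ERROR state
                  description: >
                    Ceph in HEALTH_ERROR state for more than 5 minutes.
                    Please check "ceph health detail" for more information.

Assuming a standard Prometheus toolchain, a file in this format can be run
with 'promtool test rules test_alerts.yml'.
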
index ab423c983d0c7affffc35900fad74e1ec4722f0d..cd980deb39b3c81c2a2c59557d0f77f459b098f1 100644 (file)
@@ -15,17 +15,18 @@ tests:
          value: 2
    alert_rule_test:
     - eval_time: 1m
-      alertname: health error
+      alertname: CephHealthError
     - eval_time: 6m
-      alertname: health error
+      alertname: CephHealthError
       exp_alerts:
       - exp_labels:
           instance: ceph:9283
           job: ceph
-          oid: 1.3.6.1.4.1.50495.15.1.2.2.1
+          oid: 1.3.6.1.4.1.50495.1.2.1.2.1
           type: ceph_default
           severity: critical
         exp_annotations:
+          summary: Cluster is in an ERROR state
           description: >
             Ceph in HEALTH_ERROR state for more than 5 minutes.
             Please check "ceph health detail" for more information.
@@ -43,72 +44,21 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 10m
-      alertname: health warn
+      alertname: CephHealthWarning
     - eval_time: 20m
-      alertname: health warn
+      alertname: CephHealthWarning
       exp_alerts:
       - exp_labels:
           instance: ceph:9283
           job: ceph
-          oid: 1.3.6.1.4.1.50495.15.1.2.2.2
           type: ceph_default
           severity: warning
         exp_annotations:
+          summary: Cluster is in a WARNING state
           description: >
             Ceph has been in HEALTH_WARN for more than 15 minutes.
             Please check "ceph health detail" for more information.
 
- # low monitor quorum count
-#  - interval: 1m
-#    input_series:
-#     - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a",instance="ceph:9283",
-#       job="ceph"}'
-#       values: '1 1 1 1 1'
-#     - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b",instance="ceph:9283",
-#       job="ceph"}'
-#       values: '1 1 1 1 1'
-#     - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c",instance="ceph:9283",
-#       job="ceph"}'
-#       values: '0 0 0 0 0'
-#     - series: 'ceph_mon_metadata{ceph_daemon="mon.a",ceph_version="ceph version
-#       17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
-#       (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
-#       public_addr="172.20.0.2",rank="0"}'
-#       values: '1 1 1 1 1'
-#     - series: 'ceph_mon_metadata{ceph_daemon="mon.b",ceph_version="ceph version
-#       17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
-#       (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
-#       public_addr="172.20.0.2",rank="1"}'
-#       values: '1 1 1 1 1'
-#     - series: 'ceph_mon_metadata{ceph_daemon="mon.c",ceph_version="ceph version
-#       17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
-#       (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
-#       public_addr="172.20.0.2",rank="2"}'
-#       values: '1 1 1 1 1'
-#    promql_expr_test:
-#      - expr: sum(ceph_mon_quorum_status) < 3
-#        eval_time: 1m
-#        exp_samples:
-#          - labels: '{}'
-#            value: 2
-#    alert_rule_test:
-#     - eval_time: 1m
-#       alertname: low monitor quorum count
-#       exp_alerts:
-#       - exp_labels:
-#           oid: 1.3.6.1.4.1.50495.15.1.2.3.1
-#           type: ceph_default
-#           severity: critical
-#         exp_annotations:
-#           description: |
-#             Monitor count in quorum is below three.
-
-#             Only 2 of 3 monitors are active.
-
-#             The following monitors are down:
-#               - mon.c on ceph
-
-
  # 10% OSDs down
  - interval: 1m
    input_series:
@@ -147,156 +97,20 @@ tests:
            value: 3.333333333333333E+01
    alert_rule_test:
      - eval_time: 1m
-       alertname: 10% OSDs down
+       alertname: CephOSDDownHigh
        exp_alerts:
        - exp_labels:
-           oid: 1.3.6.1.4.1.50495.15.1.2.4.1
+           oid: 1.3.6.1.4.1.50495.1.2.1.4.1
            type: ceph_default
            severity: critical
          exp_annotations:
+           summary: More than 10% of OSDs are down
            description: |
-             33.33% or 1 of 3 OSDs are down ( 10%).
+             33.33% or 1 of 3 OSDs are down (>= 10%).
 
              The following OSDs are down:
                - osd.1 on ceph
 
- # OSD down
-#  - interval: 1m
-#    input_series:
-#     - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
-#       values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-#     - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
-#       values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
-#     - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
-#       values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-#     - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
-#       ceph_version="ceph version 17.0.0-189-g3558fd72
-#       (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-#       cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-#       hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-#       public_addr="172.20.0.2"}'
-#       values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-#     - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
-#       ceph_version="ceph version 17.0.0-189-g3558fd72
-#       (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-#       cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-#       hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-#       public_addr="172.20.0.2"}'
-#       values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-#     - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
-#       ceph_version="ceph version 17.0.0-189-g3558fd72
-#       (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-#       cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-#       hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-#       public_addr="172.20.0.2"}'
-#       values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-#    promql_expr_test:
-#      - expr: count(ceph_osd_up == 0) > 0
-#        eval_time: 1m
-#        exp_samples:
-#          - labels: '{}'
-#            value: 1
-#    alert_rule_test:
-#      - eval_time: 15m
-#        alertname: OSD down
-#        exp_alerts:
-#        - exp_labels:
-#            oid: 1.3.6.1.4.1.50495.15.1.2.4.2
-#            type: ceph_default
-#            severity: warning
-#          exp_annotations:
-#            description: |
-
-#              1 OSD down for more than 15 minutes.
-
-#              1 of 3 OSDs are down.
-
-#              The following OSD is down:
-#                  - osd.1 on ceph
-
-# OSDs near full
-#  - interval: 1m
-#    input_series:
-#     - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.0",instance="ceph:9283"
-#       ,job="ceph"}'
-#       values: '1076310016 1076310016 1076310016 1076310016 1076310016
-#       1076310016'
-#     - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.1",instance="ceph:9283"
-#       ,job="ceph"}'
-#       values: '1076310016 1076310016 1076310016 1076310016 1076310016
-#       1076310016'
-#     - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.2",instance="ceph:9283"
-#       ,job="ceph"}'
-#       values: '1076310016 1076310016 1076310016 1076310016 1076310016
-#       100856561909.76'
-#     - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.0",instance="ceph:9283"
-#       ,job="ceph"}'
-#       values: '108447916032 108447916032 108447916032 108447916032 108447916032
-#       108447916032'
-#     - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.1",instance="ceph:9283"
-#       ,job="ceph"}'
-#       values: '108447916032 108447916032 108447916032 108447916032 108447916032
-#       108447916032'
-#     - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.2",instance="ceph:9283"
-#       ,job="ceph"}'
-#       values: '108447916032 108447916032 108447916032 108447916032 108447916032
-#       108447916032'
-#     - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
-#       values: '1 1 1 1 1 1'
-#     - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
-#       values: '1 1 1 1 1 1'
-#     - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
-#       values: '1 1 1 1 1 1'
-#     - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
-#       ceph_version="ceph version 17.0.0-189-g3558fd72
-#       (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-#       cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-#       hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-#       public_addr="172.20.0.2"}'
-#       values: '1 1 1 1 1 1'
-#     - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
-#       ceph_version="ceph version 17.0.0-189-g3558fd72
-#       (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-#       cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-#       hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-#       public_addr="172.20.0.2"}'
-#       values: '1 1 1 1 1 1'
-#     - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
-#       ceph_version="ceph version 17.0.0-189-g3558fd72
-#       (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-#       cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-#       hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-#       public_addr="172.20.0.2"}'
-#       values: '1 1 1 1 1 1'
-#    promql_expr_test:
-#      - expr: |
-#          (
-#            ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon)
-#            ceph_osd_up == 1) * on(ceph_daemon) group_left(hostname)
-#            ceph_osd_metadata
-#          ) * 100 > 90
-
-#        eval_time: 5m
-#        exp_samples:
-#          - labels: '{ceph_daemon="osd.2",hostname="ceph",instance="ceph:9283",
-#            job="ceph"}'
-#            value: 9.3E+01
-#    alert_rule_test:
-#      - eval_time: 10m
-#        alertname: OSDs near full
-#        exp_alerts:
-#        - exp_labels:
-#            ceph_daemon: osd.2
-#            hostname: ceph
-#            instance: ceph:9283
-#            job: ceph
-#            oid: 1.3.6.1.4.1.50495.15.1.2.4.3
-#            type: ceph_default
-#            severity: critical
-#          exp_annotations:
-#            description: >
-#              OSD osd.2 on ceph is dangerously full: 93%
-
  # flapping OSD
  - interval: 1s
    input_series:
@@ -340,18 +154,19 @@ tests:
            value: 1.2200000000000001E+01
    alert_rule_test:
      - eval_time: 5m
-       alertname: Flapping OSD
+       alertname: CephOSDFlapping
        exp_alerts:
        - exp_labels:
            ceph_daemon: osd.0
            hostname: ceph
            instance: ceph:9283
            job: ceph
-           oid: 1.3.6.1.4.1.50495.15.1.2.4.4
+           oid: 1.3.6.1.4.1.50495.1.2.1.4.4
            severity: warning
            type: ceph_default
          exp_annotations:
            documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
+           summary: Network issues are causing OSDs to flap (mark each other out)
            description: >
               OSD osd.0 on ceph was
               marked down and back up at 20.1 times once a minute for 5 minutes.
@@ -418,17 +233,18 @@ tests:
            value: 6E-01
    alert_rule_test:
      - eval_time: 10m
-       alertname: high pg count deviation
+       alertname: CephPGImbalance
        exp_alerts:
        - exp_labels:
            ceph_daemon: osd.1
            hostname: ceph
            instance: ceph:9283
            job: ceph
-           oid: 1.3.6.1.4.1.50495.15.1.2.4.5
+           oid: 1.3.6.1.4.1.50495.1.2.1.4.5
            severity: warning
            type: ceph_default
          exp_annotations:
+           summary: PG allocations are not balanced across devices
            description: >
               OSD osd.1 on ceph deviates
               by more than 30% from average PG count.
@@ -468,17 +284,18 @@ tests:
            value: 1
    alert_rule_test:
      - eval_time: 5m
-       alertname: pgs inactive
+       alertname: CephPGsInactive
        exp_alerts:
        - exp_labels:
            instance: ceph:9283
            job: ceph
            name: device_health_metrics
-           oid: 1.3.6.1.4.1.50495.15.1.2.7.1
+           oid: 1.3.6.1.4.1.50495.1.2.1.7.1
            pool_id: 3
            severity: critical
            type: ceph_default
          exp_annotations:
+           summary: One or more Placement Groups are inactive
            description: >
               1 PGs have been inactive for more than 5 minutes in pool
               device_health_metrics.
@@ -523,17 +340,18 @@ tests:
            value: 1
    alert_rule_test:
      - eval_time: 16m
-       alertname: pgs unclean
+       alertname: CephPGsUnclean
        exp_alerts:
        - exp_labels:
            instance: ceph:9283
            job: ceph
            name: device_health_metrics
-           oid: 1.3.6.1.4.1.50495.15.1.2.7.2
+           oid: 1.3.6.1.4.1.50495.1.2.1.7.2
            pool_id: 3
            severity: warning
            type: ceph_default
          exp_annotations:
+           summary: One or more placement groups are marked unclean
            description: >
               1 PGs haven't been clean for more than 15 minutes in pool
               device_health_metrics.
@@ -564,7 +382,7 @@ tests:
            value: 4.8E+00
    alert_rule_test:
      - eval_time: 10m
-       alertname: root volume full
+       alertname: CephNodeRootFilesystemFull
        exp_alerts:
        - exp_labels:
            device: /dev/mapper/fedora_localhost --live-home
@@ -572,12 +390,13 @@ tests:
            instance: node-exporter
            job: node-exporter
            mountpoint: /
-           oid: 1.3.6.1.4.1.50495.15.1.2.8.1
+           oid: 1.3.6.1.4.1.50495.1.2.1.8.1
            severity: critical
            type: ceph_default
          exp_annotations:
+           summary: Root filesystem is dangerously full
            description: >
-              Root volume (OSD and MON store) is dangerously full: 4.811% free.
+             Root volume (OSD and MON store) is dangerously full: 4.811% free.
 
  # network packets dropped
  - interval: 1s
@@ -608,19 +427,20 @@ tests:
            value: 1.2E+02
    alert_rule_test:
      - eval_time: 5m
-       alertname: network packets dropped
+       alertname: CephNodeNetworkPacketDrops
        exp_alerts:
        - exp_labels:
            device: eth0
            instance: node-exporter
            job: node-exporter
-           oid: 1.3.6.1.4.1.50495.15.1.2.8.2
+           oid: 1.3.6.1.4.1.50495.1.2.1.8.2
            severity: warning
            type: ceph_default
          exp_annotations:
+           summary: One or more NICs are seeing packet drops
            description: >
-              Node node-exporter experiences packet drop > 0.01% or >
-              10 packets/s on interface eth0.
+             Node node-exporter experiences packet drop > 0.01% or >
+             10 packets/s on interface eth0.
 
  # network packets errors
  - interval: 1s
@@ -651,20 +471,63 @@ tests:
            value: 1.2E+02
    alert_rule_test:
      - eval_time: 5m
-       alertname: network packet errors
+       alertname: CephNodeNetworkPacketErrors
        exp_alerts:
        - exp_labels:
            device: eth0
            instance: node-exporter
            job: node-exporter
-           oid: 1.3.6.1.4.1.50495.15.1.2.8.3
+           oid: 1.3.6.1.4.1.50495.1.2.1.8.3
            severity: warning
            type: ceph_default
          exp_annotations:
+           summary: One or more NICs are seeing packet errors
            description: >
-              Node node-exporter experiences packet errors > 0.01% or > 10
-              packets/s on interface eth0.
+             Node node-exporter experiences packet errors > 0.01% or > 10
+             packets/s on interface eth0.
 
+# Node Storage disk space filling up
+ - interval: 1m
+   # 20GB = 21474836480, 256MB = 268435456
+   input_series:
+    - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
+      fstype="xfs",instance="node-1",mountpoint="/rootfs"}'
+      values: '21474836480-268435456x48'
+    - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
+      fstype="xfs",instance="node-2",mountpoint="/rootfs"}'
+      values: '21474836480+0x48'
+    - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com"}'
+      values: 1+0x48
+    - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com"}'
+      values: 1+0x48
+   promql_expr_test:
+     - expr: |
+         predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
+          on(instance) group_left(nodename) node_uname_info < 0
+       eval_time: 5m
+       exp_samples:
+         - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs",
+         mountpoint="/rootfs",nodename="node-1.unittests.com"}'
+           value: -1.912602624E+12
+   alert_rule_test:
+     - eval_time: 5m
+       alertname: CephNodeDiskspaceWarning
+       exp_alerts:
+       - exp_labels:
+           severity: warning
+           type: ceph_default
+           oid: 1.3.6.1.4.1.50495.1.2.1.8.4
+           device: /dev/mapper/vg-root
+           fstype: xfs
+           instance: node-1
+           mountpoint: /rootfs
+           nodename: node-1.unittests.com
+         exp_annotations:
+           summary: Host filesystem free space is getting low
+           description: >
+             Mountpoint /rootfs on node-1.unittests.com
+             will be full in less than 5 days assuming the average fill-up
+             rate of the past 48 hours.
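
The expected sample value above follows directly from the synthetic input: node-1 starts with 20GB free and loses 256MB per one-minute sample, so a five-day linear extrapolation lands far below zero. The Python sketch below (illustrative only, not part of the commit) reproduces the arithmetic; because the series is exactly linear, a constant slope gives the same answer as the least-squares fit that predict_linear() performs.

start_bytes = 21474836480              # 20GB of free space at t=0
step_bytes = 268435456                 # 256MB consumed per 1m sample
eval_minute = 5                        # promql eval_time of 5m

current = start_bytes - eval_minute * step_bytes   # free bytes at the eval time
slope = -step_bytes / 60.0                         # bytes per second
lookahead = 3600 * 24 * 5                          # 5 days, matching the rule expression

predicted = current + slope * lookahead
print(round(predicted))                # -1912602624000 -> -1.912602624E+12 (exp_samples value)
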
  # MTU Mismatch
  - interval: 1m
    input_series:
@@ -707,72 +570,88 @@ tests:
            value: 9000
    alert_rule_test:
      - eval_time: 1m
-       alertname: MTU Mismatch
+       alertname: CephNodeInconsistentMTU
        exp_alerts:
        - exp_labels:
            device: eth4
            instance: node-exporter
            job: node-exporter
-           oid: 1.3.6.1.4.1.50495.15.1.2.8.5
            severity: warning
            type: ceph_default
          exp_annotations:
+           summary: MTU settings across Ceph hosts are inconsistent
            description: >
                Node node-exporter has a different MTU size (9000)
                than the median value on device eth4.
 
- # pool full
+ # pool full - the input data defines 6 pool series, but the description uses
+ # topk(5), so we can confirm the top-5 ranking works as expected
  - interval: 1m
    input_series:
-    - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="1"}'
-      values: '0 0 0 0 0 0 0 0 0'
-    - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="2"}'
-      values: '1850 1850 1850 1850 1850 1850 1850'
-    - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="3"}'
-      values: '900 900 23524 23524 23524 23524 23524 23524
-      23524'
-    - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="1"}'
-      values: '106287063040 106287063040 106287063040 106287063040 106287063040
-      106287063040 106287063040'
-    - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="2"}'
-      values: '106287063040 106287063040 106287063040 106287063040 106287063040
-      106287063040 106287063040'
-    - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="3"}'
-      values: '37.5 37.5 37.5 37.5 37.5 37.5 37.5'
+    - series: 'ceph_health_detail{name="POOL_FULL"}'
+      values: '0 0 0 1 1 1 1 1 1 1 1'
+    - series: 'ceph_pool_percent_used{pool_id="1"}'
+      values: '32+0x10'
+    - series: 'ceph_pool_percent_used{pool_id="2"}'
+      values: '96+0x10'
+    - series: 'ceph_pool_percent_used{pool_id="3"}'
+      values: '90+0x10'
+    - series: 'ceph_pool_percent_used{pool_id="4"}'
+      values: '72+0x10'
+    - series: 'ceph_pool_percent_used{pool_id="5"}'
+      values: '19+0x10'
+    - series: 'ceph_pool_percent_used{pool_id="6"}'
+      values: '10+0x10'
     - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
-      name="device_health_metrics",pool_id="1"}'
+      name="cephfs_data",pool_id="1"}'
+      values: '1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+      name="rbd",pool_id="2"}'
       values: '1 1 1 1 1 1 1 1 1'
     - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
-      name=".rgw.root",pool_id="2"}'
+      name="iscsi",pool_id="3"}'
       values: '1 1 1 1 1 1 1 1 1'
     - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
-      name="default.rgw.log",pool_id="3"}'
+      name="default.rgw.index",pool_id="4"}'
+      values: '1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+      name="default.rgw.log",pool_id="5"}'
+      values: '1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+      name="dummy",pool_id="6"}'
       values: '1 1 1 1 1 1 1 1 1'
    promql_expr_test:
-     - expr: |
-         ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
-         * on(pool_id) group_right ceph_pool_metadata * 100 > 90
-
-       eval_time: 1m
+     - expr: ceph_health_detail{name="POOL_FULL"} > 0
+       eval_time: 5m
        exp_samples:
-         - labels: '{instance="ceph:9283", job="ceph", name="default.rgw.log",
-           pool_id="3"}'
-           value: 9.6E+01
+         - labels:  '{__name__="ceph_health_detail", name="POOL_FULL"}'
+           value: 1
    alert_rule_test:
-     - eval_time: 2m
-       alertname: pool full
+     - eval_time: 1m
+       alertname: CephPoolFull
+     - eval_time: 10m
+       alertname: CephPoolFull
        exp_alerts:
        - exp_labels:
-           instance: ceph:9283
-           job: ceph
-           name: default.rgw.log
-           oid: 1.3.6.1.4.1.50495.15.1.2.9.1
-           pool_id: 3
+           name: POOL_FULL
            severity: critical
            type: ceph_default
+           oid: 1.3.6.1.4.1.50495.1.2.1.9.1
          exp_annotations:
-           description: Pool default.rgw.log at 96% capacity.
-
+           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
+           summary: Pool is full - writes are blocked
+           description: |
+             A pool has reached its MAX quota, or the OSDs supporting the pool
+             have reached their FULL threshold. Until this is resolved, writes to
+             the pool will be blocked.
+             Pool Breakdown (top 5)
+               - rbd at 96%
+               - iscsi at 90%
+               - default.rgw.index at 72%
+               - cephfs_data at 32%
+               - default.rgw.log at 19%
+             Either increase the pool's quota, or add capacity to the cluster first
+             then increase its quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
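
The "Pool Breakdown (top 5)" list in the expected annotation can be sanity-checked against the six synthetic pools defined above. A small Python sketch (illustrative only, not part of the commit) that mirrors the topk(5) selection over ceph_pool_percent_used:

pool_percent_used = {
    "cephfs_data": 32, "rbd": 96, "iscsi": 90,
    "default.rgw.index": 72, "default.rgw.log": 19, "dummy": 10,
}
top5 = sorted(pool_percent_used.items(), key=lambda kv: kv[1], reverse=True)[:5]
for name, pct in top5:
    print(f"  - {name} at {pct}%")     # 'dummy' (10%) is the one series dropped by topk(5)
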
  # slow OSD ops
  - interval : 1m
    input_series:
@@ -787,7 +666,7 @@ tests:
            value: 1
    alert_rule_test:
      - eval_time: 20m
-       alertname: Slow OSD Ops
+       alertname: CephSlowOps
        exp_alerts:
        - exp_labels:
            instance: ceph:9283
@@ -796,6 +675,7 @@ tests:
            type: ceph_default
          exp_annotations:
            documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
+           summary: MON/OSD operations are slow to complete
            description: >
              1 OSD requests are taking too long to process
              (osd_op_complaint_time exceeded)
@@ -813,15 +693,17 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Cluster upgrade has failed
+      alertname: CephadmUpgradeFailed
     - eval_time: 5m
-      alertname: Cluster upgrade has failed
+      alertname: CephadmUpgradeFailed
       exp_alerts:
       - exp_labels:
           name: UPGRADE_EXCEPTION
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.11.2
         exp_annotations:
+          summary: Ceph version upgrade has failed
           description: >
             The cephadm cluster upgrade process has failed. The cluster remains in
             an undetermined state.
@@ -839,15 +721,17 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: A daemon managed by cephadm is down
+      alertname: CephadmDaemonFailed
     - eval_time: 5m
-      alertname: A daemon managed by cephadm is down
+      alertname: CephadmDaemonFailed
       exp_alerts:
       - exp_labels:
           name: CEPHADM_FAILED_DAEMON
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.11.1
         exp_annotations:
+          summary: A Ceph daemon managed by cephadm is down
           description: >
             A daemon managed by cephadm is no longer active. Determine, which
             daemon is down with 'ceph health detail'. you may start daemons with
@@ -864,9 +748,9 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: cephadm management has been paused
+      alertname: CephadmPaused
     - eval_time: 5m
-      alertname: cephadm management has been paused
+      alertname: CephadmPaused
       exp_alerts:
       - exp_labels:
           name: CEPHADM_PAUSED
@@ -874,6 +758,7 @@ tests:
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
+          summary: Orchestration tasks via cephadm are PAUSED
           description: >
             Cluster management has been paused manually. This will prevent the
             orchestrator from service management and reconciliation. If this is
@@ -891,16 +776,18 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Ceph Filesystem damage detected
+      alertname: CephFilesystemDamaged
     - eval_time: 5m
-      alertname: Ceph Filesystem damage detected
+      alertname: CephFilesystemDamaged
       exp_alerts:
       - exp_labels:
           name: MDS_DAMAGE
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.5.1
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
+          summary: Ceph filesystem is damaged.
           description: >
             The filesystems metadata has been corrupted. Data access
             may be blocked.
@@ -919,22 +806,161 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Ceph Filesystem switched to READ ONLY
+      alertname: CephFilesystemReadOnly
     - eval_time: 5m
-      alertname: Ceph Filesystem switched to READ ONLY
+      alertname: CephFilesystemReadOnly
       exp_alerts:
       - exp_labels:
           name: MDS_HEALTH_READ_ONLY
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.5.2
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
+          summary: Ceph filesystem is in read-only mode, due to write error(s)
           description: >
             The filesystem has switched to READ ONLY due to an unexpected
             write error, when writing to the metadata pool
 
             Either analyse the output from the mds daemon admin socket, or
             escalate to support
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="MDS_ALL_DOWN"}'
+      values: '0 0 1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: CephFilesystemOffline
+    - eval_time: 10m
+      alertname: CephFilesystemOffline
+      exp_alerts:
+      - exp_labels:
+          name: MDS_ALL_DOWN
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.5.3
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
+          summary: Ceph filesystem is offline
+          description: >
+            All MDS ranks are unavailable. The ceph daemons providing the metadata
+            for the Ceph filesystem are all down, rendering the filesystem offline.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="FS_DEGRADED"}'
+      values: '0 0 1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="FS_DEGRADED"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: CephFilesystemDegraded
+    - eval_time: 10m
+      alertname: CephFilesystemDegraded
+      exp_alerts:
+      - exp_labels:
+          name: FS_DEGRADED
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.5.4
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
+          summary: Ceph filesystem is degraded
+          description: >
+            One or more metadata daemons (MDS ranks) have failed or are in a
+            damaged state. At best the filesystem is partially available; at
+            worst, the filesystem is completely unusable.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"}'
+      values: '0 0 1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: CephFilesystemInsufficientStandby
+    - eval_time: 10m
+      alertname: CephFilesystemInsufficientStandby
+      exp_alerts:
+      - exp_labels:
+          name: MDS_INSUFFICIENT_STANDBY
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
+          summary: Ceph filesystem standby daemons too low
+          description: >
+            The number of standby daemons available is lower than the minimum set
+            by standby_count_wanted. Adjust the standby count, or increase the
+            number of MDS daemons within the filesystem.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"}'
+      values: '0 0 1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: CephFilesystemFailureNoStandby
+    - eval_time: 10m
+      alertname: CephFilesystemFailureNoStandby
+      exp_alerts:
+      - exp_labels:
+          name: FS_WITH_FAILED_MDS
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.5.5
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
+          summary: Ceph MDS daemon failed, no further standby available
+          description: >
+            An MDS daemon has failed, leaving only one active rank without
+            further standby. Investigate the cause of the failure or add a
+            standby daemon
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"}'
+      values: '0 0 1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: CephFilesystemMDSRanksLow
+    - eval_time: 10m
+      alertname: CephFilesystemMDSRanksLow
+      exp_alerts:
+      - exp_labels:
+          name: MDS_UP_LESS_THAN_MAX
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
+          summary: Ceph MDS daemon count is lower than configured
+          description: >
+            The filesystem's "max_mds" setting defines the number of MDS ranks in
+            the filesystem. The current number of active MDS daemons is less than
+            this setting.
 # MGR
  - interval: 1m
    input_series:
@@ -948,16 +974,18 @@ tests:
            value: 0
    alert_rule_test:
     - eval_time: 1m
-      alertname: mgr prometheus module is not active
+      alertname: CephMgrPrometheusModuleInactive
     - eval_time: 10m
-      alertname: mgr prometheus module is not active
+      alertname: CephMgrPrometheusModuleInactive
       exp_alerts:
       - exp_labels:
           instance: ceph-mgr:9283
           job: ceph
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.6.2
         exp_annotations:
+          summary: Ceph's mgr/prometheus module is not available
           description: >
             The mgr/prometheus module at ceph-mgr:9283 is unreachable. This
             could mean that the module has been disabled or the mgr itself is down.
@@ -979,16 +1007,18 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: mgr module failure
+      alertname: CephMgrModuleCrash
     - eval_time: 15m
-      alertname: mgr module failure
+      alertname: CephMgrModuleCrash
       exp_alerts:
       - exp_labels:
           name: RECENT_MGR_MODULE_CRASH
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.6.1
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
+          summary: A mgr module has recently crashed
           description: >
             One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A
             crashed module may impact functionality within the cluster. Use the 'ceph crash' commands to
@@ -1008,16 +1038,18 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Ceph mon disk space critically low
+      alertname: CephMonDiskspaceCritical
     - eval_time: 10m
-      alertname: Ceph mon disk space critically low
+      alertname: CephMonDiskspaceCritical
       exp_alerts:
       - exp_labels:
           name: "MON_DISK_CRIT"
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.3.2
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
+          summary: Disk space on at least one monitor is critically low
           description: |
             The free space available to a monitor's store is critically low (<5% by default).
             You should increase the space available to the monitor(s). The
@@ -1037,9 +1069,9 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Ceph mon disk space running low
+      alertname: CephMonDiskspaceLow
     - eval_time: 10m
-      alertname: Ceph mon disk space running low
+      alertname: CephMonDiskspaceLow
       exp_alerts:
       - exp_labels:
           name: "MON_DISK_LOW"
@@ -1047,6 +1079,7 @@ tests:
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
+          summary: Disk space on at least one monitor is approaching full
           description: |
             The space available to a monitor's store is approaching full (>70% is the default).
             You should increase the space available to the monitor store. The
@@ -1064,9 +1097,9 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Clock skew detected across Ceph Monitor daemons
+      alertname: CephMonClockSkew
     - eval_time: 10m
-      alertname: Clock skew detected across Ceph Monitor daemons
+      alertname: CephMonClockSkew
       exp_alerts:
       - exp_labels:
           name: "MON_CLOCK_SKEW"
@@ -1074,6 +1107,7 @@ tests:
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
+          summary: Clock skew across the Monitor hosts detected
           description: |
             The ceph monitors rely on a consistent time reference to maintain
             quorum and cluster consistency. This event indicates that at least
@@ -1107,17 +1141,18 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Monitor down, quorum is at risk
+      alertname: CephMonDownQuorumAtRisk
       # shouldn't fire
     - eval_time: 10m
-      alertname: Monitor down, quorum is at risk
+      alertname: CephMonDownQuorumAtRisk
       exp_alerts:
       - exp_labels:
           severity: critical
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.3.1
+          oid: 1.3.6.1.4.1.50495.1.2.1.3.1
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
+          summary: Monitor quorum is at risk
           description: |
             Quorum requires a majority of monitors (x 2) to be active
             Without quorum the cluster will become inoperable, affecting all connected clients and services.
@@ -1155,15 +1190,16 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Monitor down
+      alertname: CephMonDown
     - eval_time: 10m
-      alertname: Monitor down
+      alertname: CephMonDown
       exp_alerts:
       - exp_labels:
           severity: warning
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
+          summary: One or more Ceph monitors are down
           description: |
             You have 1 monitor down.
             Quorum is still intact, but the loss of further monitors will make your cluster inoperable.
@@ -1183,9 +1219,9 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Device failure predicted
+      alertname: CephDeviceFailurePredicted
     - eval_time: 10m
-      alertname: Device failure predicted
+      alertname: CephDeviceFailurePredicted
       exp_alerts:
       - exp_labels:
           name: "DEVICE_HEALTH"
@@ -1193,6 +1229,7 @@ tests:
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
+          summary: Device(s) have been predicted to fail soon
           description: |
             The device health module has determined that one or more devices will fail
             soon. To review the device states use 'ceph device ls'. To show a specific
@@ -1212,16 +1249,18 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Too many devices predicted to fail
+      alertname: CephDeviceFailurePredictionTooHigh
     - eval_time: 10m
-      alertname: Too many devices predicted to fail
+      alertname: CephDeviceFailurePredictionTooHigh
       exp_alerts:
       - exp_labels:
           name: "DEVICE_HEALTH_TOOMANY"
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.7
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
+          summary: Too many devices have been predicted to fail, unable to resolve
           description: |
             The device health module has determined that the number of devices predicted to
             fail can not be remediated automatically, since it would take too many osd's out of
@@ -1239,9 +1278,9 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Device failure predicted, but automatic drain is incomplete
+      alertname: CephDeviceFailureRelocationIncomplete
     - eval_time: 10m
-      alertname: Device failure predicted, but automatic drain is incomplete
+      alertname: CephDeviceFailureRelocationIncomplete
       exp_alerts:
       - exp_labels:
           name: "DEVICE_HEALTH_IN_USE"
@@ -1249,6 +1288,7 @@ tests:
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
+          summary: A device failure is predicted, but unable to relocate data
           description: |
             The device health module has determined that one or more devices will fail
             soon, but the normal process of relocating the data on the device to other
@@ -1274,15 +1314,17 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: OSD Host is down
+      alertname: CephOSDHostDown
     - eval_time: 10m
-      alertname: OSD Host is down
+      alertname: CephOSDHostDown
       exp_alerts:
       - exp_labels:
           name: "OSD_HOST_DOWN"
           severity: warning
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.8
         exp_annotations:
+          summary: An OSD host is offline
           description: |
             The following OSDs are down:
             - ceph-osd-1 : osd.0
@@ -1298,15 +1340,16 @@ tests:
            value: 0
    alert_rule_test:
     - eval_time: 1m
-      alertname: OSD hearbeats running slow (frontend)
+      alertname: CephOSDTimeoutsPublicNetwork
     - eval_time: 10m
-      alertname: OSD hearbeats running slow (frontend)
+      alertname: CephOSDTimeoutsPublicNetwork
       exp_alerts:
       - exp_labels:
           name: "OSD_SLOW_PING_TIME_FRONT"
           severity: warning
           type: ceph_default
         exp_annotations:
+          summary: Network issues delaying OSD heartbeats (public network)
           description: |
             OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network
             for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
@@ -1322,15 +1365,16 @@ tests:
            value: 0
    alert_rule_test:
     - eval_time: 1m
-      alertname: OSD hearbeats running slow (backend)
+      alertname: CephOSDTimeoutsClusterNetwork
     - eval_time: 10m
-      alertname: OSD hearbeats running slow (backend)
+      alertname: CephOSDTimeoutsClusterNetwork
       exp_alerts:
       - exp_labels:
           name: "OSD_SLOW_PING_TIME_BACK"
           severity: warning
           type: ceph_default
         exp_annotations:
+          summary: Network issues delaying OSD heartbeats (cluster network)
           description: |
             OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network
             for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
@@ -1346,9 +1390,9 @@ tests:
            value: 0
    alert_rule_test:
     - eval_time: 1m
-      alertname: OSD disk size mismatch
+      alertname: CephOSDInternalDiskSizeMismatch
     - eval_time: 10m
-      alertname: OSD disk size mismatch
+      alertname: CephOSDInternalDiskSizeMismatch
       exp_alerts:
       - exp_labels:
           name: "BLUESTORE_DISK_SIZE_MISMATCH"
@@ -1356,6 +1400,7 @@ tests:
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
+          summary: OSD size inconsistency error
           description: |
             One or more OSDs have an internal inconsistency between the size of the physical device and it's metadata.
             This could lead to the OSD(s) crashing in future. You should redeploy the effected OSDs.
@@ -1371,9 +1416,9 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: OSD Read errors
+      alertname: CephOSDReadErrors
     - eval_time: 10m
-      alertname: OSD Read errors
+      alertname: CephOSDReadErrors
       exp_alerts:
       - exp_labels:
           name: "BLUESTORE_SPURIOUS_READ_ERRORS"
@@ -1381,6 +1426,7 @@ tests:
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
+          summary: Device read errors detected
           description: >
             An OSD has encountered read errors, but the OSD has recovered by retrying
             the reads. This may indicate an issue with the Hardware or Kernel.
@@ -1408,17 +1454,18 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: OSD down
+      alertname: CephOSDDown
     - eval_time: 10m
-      alertname: OSD down
+      alertname: CephOSDDown
       exp_alerts:
       - exp_labels:
           name: "OSD_DOWN"
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.4.2
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.2
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
+          summary: An OSD has been marked down/unavailable
           description: |
             1 OSD down for over 5mins.
 
@@ -1436,23 +1483,53 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: OSDs near full
+      alertname: CephOSDNearFull
     - eval_time: 10m
-      alertname: OSDs near full
+      alertname: CephOSDNearFull
       exp_alerts:
       - exp_labels:
           name: "OSD_NEARFULL"
           severity: warning
           type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.4.3
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.3
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
+          summary: OSD(s) running low on free space (NEARFULL)
           description: |
             One or more OSDs have reached their NEARFULL threshold
 
             Use 'ceph health detail' to identify which OSDs have reached this threshold.
             To resolve, either add capacity to the cluster, or delete unwanted data
  - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="OSD_FULL"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="OSD_FULL"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="OSD_FULL"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: CephOSDFull
+    - eval_time: 10m
+      alertname: CephOSDFull
+      exp_alerts:
+      - exp_labels:
+          name: "OSD_FULL"
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.4.6
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
+          summary: OSD(s) full, writes blocked
+          description: |
+            An OSD has reached its full threshold. Writes from all pools that share the
+            affected OSD will be blocked.
+
+            To resolve, either add capacity to the cluster, or delete unwanted data
+ - interval: 1m
    input_series:
     - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}'
       values: '0+0x2 1+0x10'
@@ -1464,9 +1541,9 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: OSD unable to perform rebalance
+      alertname: CephOSDBackfillFull
     - eval_time: 10m
-      alertname: OSD unable to perform rebalance
+      alertname: CephOSDBackfillFull
       exp_alerts:
       - exp_labels:
           name: "OSD_BACKFILLFULL"
@@ -1474,6 +1551,7 @@ tests:
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
+          summary: OSD(s) too full for backfill operations
           description: |
             An OSD has reached it's BACKFILL FULL threshold. This will prevent rebalance operations
             completing for some pools. Check the current capacity utilisation with 'ceph df'
@@ -1491,9 +1569,9 @@ tests:
            value: 0
    alert_rule_test:
     - eval_time: 1m
-      alertname: OSD too many read repairs
+      alertname: CephOSDTooManyRepairs
     - eval_time: 10m
-      alertname: OSD too many read repairs
+      alertname: CephOSDTooManyRepairs
       exp_alerts:
       - exp_labels:
           name: "OSD_TOO_MANY_REPAIRS"
@@ -1501,67 +1579,72 @@ tests:
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
+          summary: OSD has hit a high number of read errors
           description: |
             Reads from an OSD have used a secondary PG to return data to the client, indicating
             a potential failing disk.
 # Pools
- - interval: 1m
+   # the percent-full prediction input covers pools 1 and 2 only; only pool 1 (rbd) should breach the threshold
+ - interval: 12h
    input_series:
-    - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
-      values: '0+0x2 1+0x10'
+    - series: 'ceph_pool_percent_used{pool_id="1"}'
+      values: '70 75 80 87 92'
+    - series: 'ceph_pool_percent_used{pool_id="2"}'
+      values: '22 22 23 23 24'
+    - series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
+      values: '1 1 1 1 1'
    promql_expr_test:
-     - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
-       eval_time: 3m
+     - expr: |
+         (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
+              group_right ceph_pool_metadata) >= 95
+       eval_time: 36h
        exp_samples:
-         - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
-           value: 1
+         - labels: '{name="rbd",pool_id="1",type="replicated"}'
+           value: 1.424E+02 # 142%
    alert_rule_test:
-    - eval_time: 1m
-      alertname: Ceph pool is too full for recovery/rebalance
-    - eval_time: 5m
-      alertname: Ceph pool is too full for recovery/rebalance
+    - eval_time: 48h
+      alertname: CephPoolGrowthWarning
       exp_alerts:
       - exp_labels:
-          name: "POOL_BACKFILLFULL"
+          name: rbd
+          pool_id: 1
           severity: warning
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.9.2
         exp_annotations:
+          summary: Pool growth rate may soon exceed its capacity
           description: >
-            A pool is approaching it's near full threshold, which will
-            prevent rebalance operations from completing. You should
-            consider adding more capacity to the pool.
-
+            Pool 'rbd' will be full in less than 5 days
+            assuming the average fill-up rate of the past 48 hours.
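
The 1.424E+02 expected sample is the least-squares extrapolation of pool 1's usage. The Python sketch below (illustrative only, not part of the commit) mirrors the regression predict_linear() performs over the [2d] window (samples 70, 75, 80 and 87, taken 12h apart) followed by the 5-day look-ahead:

xs = [0, 43200, 86400, 129600]         # sample timestamps in seconds, 12h apart
ys = [70, 75, 80, 87]                  # ceph_pool_percent_used{pool_id="1"} inside the [2d] window

mean_x = sum(xs) / len(xs)
mean_y = sum(ys) / len(ys)
slope = (sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
         / sum((x - mean_x) ** 2 for x in xs))
at_eval = mean_y + slope * (xs[-1] - mean_x)       # regression value at the 36h eval time
print(round(at_eval + slope * 3600 * 24 * 5, 1))   # 142.4 -> exceeds the >= 95 threshold
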
  - interval: 1m
    input_series:
-    - series: 'ceph_health_detail{name="POOL_FULL"}'
+    - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
       values: '0+0x2 1+0x10'
    promql_expr_test:
-     - expr: ceph_health_detail{name="POOL_FULL"} == 1
+     - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
        eval_time: 3m
        exp_samples:
-         - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}'
+         - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Ceph pool is full - writes blocked
-    - eval_time: 10m
-      alertname: Ceph pool is full - writes blocked
+      alertname: CephPoolBackfillFull
+    - eval_time: 5m
+      alertname: CephPoolBackfillFull
       exp_alerts:
       - exp_labels:
-          name: "POOL_FULL"
-          severity: critical
+          name: "POOL_BACKFILLFULL"
+          severity: warning
           type: ceph_default
         exp_annotations:
-          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
-          description: |
-            A pool has reached it's MAX quota, or the OSDs supporting the pool
-            have reached their FULL threshold. Until this is resolved, writes to
-            the pool will be blocked.
+          summary: Free space in a pool is too low for recovery/rebalance
+          description: >
+            A pool is approaching its near-full threshold, which will
+            prevent rebalance operations from completing. You should
+            consider adding more capacity to the pool.
 
-            Determine the affected pool with 'ceph df detail', for example looking
-            at QUOTA BYTES and STORED. Either increase the pools quota, or add
-            capacity to the cluster first then increase it's quota
-            (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
  - interval: 1m
    input_series:
     - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}'
@@ -1574,15 +1657,16 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Ceph pool is approaching full
+      alertname: CephPoolNearFull
     - eval_time: 10m
-      alertname: Ceph pool is approaching full
+      alertname: CephPoolNearFull
       exp_alerts:
       - exp_labels:
           name: "POOL_NEAR_FULL"
           severity: warning
           type: ceph_default
         exp_annotations:
+          summary: One or more Ceph pools are getting full
           description: |
             A pool has exceeeded it warning (percent full) threshold, or the OSDs
             supporting the pool have reached their NEARFULL thresholds. Writes may
@@ -1607,9 +1691,9 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Placement Group(s) have not been scrubbed
+      alertname: CephPGNotScrubbed
     - eval_time: 10m
-      alertname: Placement Group(s) have not been scrubbed
+      alertname: CephPGNotScrubbed
       exp_alerts:
       - exp_labels:
           name: "PG_NOT_SCRUBBED"
@@ -1617,6 +1701,7 @@ tests:
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
+          summary: Placement group(s) have not been scrubbed
           description: |
             One or more PGs have not been scrubbed recently. The scrub process is a data integrity
             feature, protectng against bit-rot. It checks that objects and their metadata (size and
@@ -1625,6 +1710,67 @@ tests:
             scrub window.
 
             You can manually initiate a scrub with: ceph pg scrub <pgid>
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="PG_DAMAGED"}'
+      values: '0+0x4 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
+       eval_time: 5m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: CephPGsDamaged
+    - eval_time: 10m
+      alertname: CephPGsDamaged
+      exp_alerts:
+      - exp_labels:
+          name: "PG_DAMAGED"
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.4
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
+          summary: Placement group damaged, manual intervention needed
+          description: >
+            During data consistency checks (scrub), at least one PG has been flagged as being
+            damaged or inconsistent.
+
+            Check to see which PG is affected, and attempt a manual repair if necessary. To list
+            problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use
+            the 'ceph pg repair <pg_num>' command.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="TOO_MANY_PGS"}'
+      values: '0+0x4 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
+       eval_time: 5m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: CephPGsHighPerOSD
+    - eval_time: 10m
+      alertname: CephPGsHighPerOSD
+      exp_alerts:
+      - exp_labels:
+          name: "TOO_MANY_PGS"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
+          summary: Placement groups per OSD is too high
+          description: |
+            The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).
+
+            Check that the pg_autoscaler hasn't been disabled for any of the pools, with 'ceph osd pool autoscale-status'
+            and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide
+            the autoscaler based on the expected relative size of the pool
+            (e.g. 'ceph osd pool set cephfs.cephfs.meta target_size_ratio .1')
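
As a rough guide to when TOO_MANY_PGS trips: the cluster-wide count of PG replicas divided by the number of OSDs is compared against mon_max_pg_per_osd. A back-of-the-envelope Python sketch (illustrative only; the pool values and the 250 default are assumptions, not taken from this change):

pools = [                              # hypothetical pools: pg_num and replica count (size)
    {"name": "rbd", "pg_num": 512, "size": 3},
    {"name": "cephfs_data", "pg_num": 256, "size": 3},
    {"name": "cephfs_metadata", "pg_num": 64, "size": 3},
]
num_osds = 6
mon_max_pg_per_osd = 250               # assumed default; check the value configured on your cluster

pg_replicas_per_osd = sum(p["pg_num"] * p["size"] for p in pools) / num_osds
print(pg_replicas_per_osd, pg_replicas_per_osd > mon_max_pg_per_osd)   # 416.0 True
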
  - interval: 1m
    input_series:
     - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}'
@@ -1637,16 +1783,18 @@ tests:
            value: 0
    alert_rule_test:
     - eval_time: 1m
-      alertname: Recovery at risk, cluster too full
+      alertname: CephPGRecoveryAtRisk
     - eval_time: 10m
-      alertname: Recovery at risk, cluster too full
+      alertname: CephPGRecoveryAtRisk
       exp_alerts:
       - exp_labels:
           name: "PG_RECOVERY_FULL"
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.5
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
+          summary: OSDs are too full for automatic recovery
           description: >
             Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their
             'full' threshold. Add more capacity to the cluster, or delete unwanted data.
@@ -1662,16 +1810,18 @@ tests:
            value: 0
    alert_rule_test:
     - eval_time: 1m
-      alertname: Cluster too full, automatic data recovery impaired
+      alertname: CephPGBackfillAtRisk
     - eval_time: 10m
-      alertname: Cluster too full, automatic data recovery impaired
+      alertname: CephPGBackfillAtRisk
       exp_alerts:
       - exp_labels:
           name: "PG_BACKFILL_FULL"
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.6
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
+          summary: Backfill operations are blocked due to lack of free space
           description: >
             Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs
             have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data.
@@ -1689,22 +1839,24 @@ tests:
    alert_rule_test:
     # PG_AVAILABILITY and OSD_DOWN not firing .. no alert
     - eval_time: 1m
-      alertname: I/O blocked to some data
+      alertname: CephPGUnavilableBlockingIO
       exp_alerts:
     # PG_AVAILABILITY firing, but osd_down is active .. no alert
     - eval_time: 5m
-      alertname: I/O blocked to some data
+      alertname: CephPGUnavilableBlockingIO
       exp_alerts:
     # PG_AVAILABILITY firing, AND OSD_DOWN is not active...raise the alert
     - eval_time: 15m
-      alertname: I/O blocked to some data
+      alertname: CephPGUnavilableBlockingIO
       exp_alerts:
       - exp_labels:
           name: "PG_AVAILABILITY"
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.3
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
+          summary: Placement group is unavailable, blocking some I/O
           description: >
             Data availability is reduced impacting the clusters abilty to service I/O to some data. One or
             more placement groups (PGs) are in a state that blocks IO.
@@ -1720,9 +1872,9 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 1m
-      alertname: Placement Group(s) have not been 'DEEP' scrubbed
+      alertname: CephPGNotDeepScrubbed
     - eval_time: 10m
-      alertname: Placement Group(s) have not been 'DEEP' scrubbed
+      alertname: CephPGNotDeepScrubbed
       exp_alerts:
       - exp_labels:
           name: "PG_NOT_DEEP_SCRUBBED"
@@ -1730,6 +1882,7 @@ tests:
           type: ceph_default
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
+          summary: Placement group(s) have not been deep scrubbed
           description: |
             One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity
             feature, protectng against bit-rot. It compares the contents of objects and their
@@ -1752,13 +1905,15 @@ tests:
            value: 1
    alert_rule_test:
     - eval_time: 5m
-      alertname: Scrape job is missing
+      alertname: PrometheusJobMissing
       exp_alerts:
       - exp_labels:
           job: ceph
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.12.1
         exp_annotations:
+          summary: The scrape job for Ceph is missing from Prometheus
           description: |
             The prometheus job that scrapes from Ceph is no longer defined, this
             will effectively mean you'll have no metrics or alerts for the cluster.
@@ -1789,18 +1944,50 @@ tests:
    alert_rule_test:
     # OBJECT_UNFOUND but osd.2 is down, so don't fire
     - eval_time: 5m
-      alertname: Data not found/missing
+      alertname: CephObjectMissing
       exp_alerts:
     # OBJECT_UNFOUND and all osd's are online, so fire
     - eval_time: 15m
-      alertname: Data not found/missing
+      alertname: CephObjectMissing
       exp_alerts:
       - exp_labels:
           severity: critical
           type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.10.1
         exp_annotations:
           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
+          summary: Object(s) have been marked UNFOUND
           description: |
             A version of a RADOS object can not be found, even though all OSDs are up. I/O
             requests for this object from clients will block (hang). Resolving this issue may
-            require the object to be rolled back to a prior version manually, and manually verified.
\ No newline at end of file
+            require the object to be rolled back to a prior version manually, and manually verified.
+# Generic Alerts
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="RECENT_CRASH"}'
+      values: '0 0 0 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="RECENT_CRASH"} == 1
+       eval_time: 1m
+       exp_samples:
+   alert_rule_test:
+    # not firing
+    - eval_time: 1m
+      alertname: CephDaemonCrash
+      exp_alerts:
+    # firing
+    - eval_time: 10m
+      alertname: CephDaemonCrash
+      exp_alerts:
+      - exp_labels:
+          name: RECENT_CRASH
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.1.2
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
+          summary: One or more Ceph daemons have crashed, and are pending acknowledgement
+          description: |
+            One or more daemons have crashed recently, and need to be acknowledged. This notification
+            ensures that software crashes don't go unseen. To acknowledge a crash, use the
+            'ceph crash archive <id>' command.
\ No newline at end of file
index b424533242903893dd4441f16b3ba137c9269c4d..428779a47de87cf26ffccd8c05e8d4086e328a2c 100755 (executable)
@@ -8,15 +8,20 @@
 #  8 .. Missing fields in YAML
 # 12 .. Invalid YAML - unable to load
 # 16 .. Missing input files
-
-# TODO: logging for debug
+#
+# Externals
+# snmptranslate .. used to extract the OIDs defined in the MIB, so each rule's OID can be verified against it
+#
 
 import re
 import os
 import sys
 import yaml
+import shutil
+import string
 from bs4 import BeautifulSoup
 from typing import List, Any, Dict, Set, Optional, Tuple
+import subprocess
 
 import urllib.request
 import urllib.error
@@ -25,6 +30,15 @@ from urllib.parse import urlparse
 DOCLINK_NAME = 'documentation'
 DEFAULT_RULES_FILENAME = '../alerts/ceph_default_alerts.yml'
 DEFAULT_TEST_FILENAME = 'test_alerts.yml'
+MIB_FILE = '../../snmp/CEPH-MIB.txt'
+
+
+def isascii(s: str) -> bool:
+    try:
+        s.encode('ascii')
+    except UnicodeEncodeError:
+        return False
+    return True
 
 
 def read_file(file_name: str) -> Tuple[str, str]:
@@ -52,6 +66,14 @@ def load_yaml(file_name: str) -> Tuple[Dict[str, Any], str]:
     return data, errs
 
 
+def run_command(command: str):
+    c = command.split()
+    completion = subprocess.run(c, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    return (completion.returncode,
+            completion.stdout.decode('utf-8').split('\n'),
+            completion.stderr.decode('utf-8').split('\n'))
+
+
 class HTMLCache:
     def __init__(self) -> None:
         self.cache: Dict[str, Tuple[int, str]] = {}
@@ -93,7 +115,6 @@ class PrometheusRule:
     expected_attrs = [
         'alert',
         'expr',
-        'for',
         'labels',
         'annotations'
     ]
@@ -106,11 +127,30 @@ class PrometheusRule:
         self.rule = rule_data
         self.errors: List[str] = []
         self.warnings: List[str] = []
-
         self.validate()
 
+    @property
+    def has_oid(self):
+        return bool(self.rule.get('labels', {}).get('oid', ''))
+
+    @property
+    def labels(self) -> Dict[str, str]:
+        return self.rule.get('labels', {})
+
+    @property
+    def annotations(self) -> Dict[str, str]:
+        return self.rule.get('annotations', {})
+
     def _check_alert_name(self):
-        pass
+        # this check is simplistic, but good enough for the alert names used here
+        if self.name[0] in string.ascii_uppercase and \
+          self.name != self.name.lower() and \
+          self.name != self.name.upper() and \
+          " " not in self.name and \
+          "_" not in self.name:
+            return
+
+        self.warnings.append("Alert name is not in CamelCase format")
 
     def _check_structure(self):
         rule_attrs = self.rule.keys()
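
A quick way to see what the new _check_alert_name() heuristic accepts is to replay its conditions against a few of the names touched by this change. The sketch below (illustrative only, not part of the commit) applies the same checks:

import string

def looks_camel_case(name: str) -> bool:
    # same conditions as _check_alert_name(): leading capital, mixed case,
    # and no spaces or underscores
    return (name[0] in string.ascii_uppercase
            and name != name.lower()
            and name != name.upper()
            and " " not in name
            and "_" not in name)

for name in ("CephOSDFlapping", "CephPGImbalance",   # new-style names -> no warning
             "Flapping OSD", "OSD_DOWN"):            # old-style names -> warning
    print(f"{name!r}: {'ok' if looks_camel_case(name) else 'warns'}")
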
@@ -123,17 +163,16 @@ class PrometheusRule:
 
     def _check_labels(self):
         for rqd in ['severity', 'type']:
-            if rqd not in self.rule.get('labels', ''):
+            if rqd not in self.labels.keys():
                 self.errors.append(f"rule is missing {rqd} label definition")
 
     def _check_annotations(self):
-        for rqd in ['description']:
-                if rqd not in self.rule.get('annotations', ''):
+        for rqd in ['summary', 'description']:
+                if rqd not in self.annotations:
                     self.errors.append(f"rule is missing {rqd} annotation definition")
 
     def _check_doclink(self):
-        annotations = self.rule.get('annotations', {})
-        doclink = annotations.get(DOCLINK_NAME, '')
+        doclink = self.annotations.get(DOCLINK_NAME, '')
 
         if doclink:
             url = urlparse(doclink)
@@ -148,20 +187,36 @@ class PrometheusRule:
                 self.errors.append(f"documentation link error: {status} {content}")
 
     def _check_snmp(self):
-        labels = self.rule.get('labels', {})
-        oid = labels.get('oid', '')
-        if labels.get('severity', '') == 'critical' and not oid:
+        oid = self.labels.get('oid', '')
+
+        if self.labels.get('severity', '') == 'critical' and not oid:
             self.warnings.append("critical level alert is missing an SNMP oid entry")
-        if oid and not re.search('^1.3.6.1.4.1.50495.15.1.2.\\d+.\\d+$', oid):
-            self.errors.append("invalid OID provided")
+        if oid and not re.search('^1.3.6.1.4.1.50495.1.2.\\d+.\\d+.\\d+$', oid):
+            self.errors.append("invalid OID format provided")
+        if self.group.get_oids():
+            if oid and oid not in self.group.get_oids():
+                self.errors.append(f"rule defines an OID {oid} that is missing from the MIB file({os.path.basename(MIB_FILE)})")
+
+    def _check_ascii(self):
+        if 'oid' not in self.labels:
+            return
+
+        desc = self.annotations.get('description', '')
+        summary = self.annotations.get('summary', '')
+        if not isascii(desc):
+            self.errors.append(f"non-ascii characters found in 'description' field will cause issues in associated snmp trap.")
+        if not isascii(summary):
+            self.errors.append(f"non-ascii characters found in 'summary' field will cause issues in associated snmp trap.")
 
     def validate(self):
+
         self._check_alert_name()
         self._check_structure()
         self._check_labels()
         self._check_annotations()
         self._check_doclink()
         self._check_snmp()
+        self._check_ascii()
         char = '.'
 
         if self.errors:
@@ -200,6 +255,9 @@ class RuleGroup:
     def fetch_html_page(self, url):
         return self.rule_file.fetch_html_page(url)
 
+    def get_oids(self):
+        return self.rule_file.oid_list
+
     @property
     def error_count(self):
         return len(self.problems['error'])
@@ -214,10 +272,11 @@ class RuleGroup:
 
 class RuleFile:
 
-    def __init__(self, parent, file_name, rules):
+    def __init__(self, parent, file_name, rules, oid_list):
         self.parent = parent
         self.file_name = file_name
         self.rules: Dict[str, Any] = rules
+        self.oid_list = oid_list
         self.problems: Set[str] = set()
         self.group: Dict[str, RuleGroup] = {}
         self.alert_names_seen: Set[str] = set()
@@ -242,10 +301,19 @@ class RuleFile:
     @property
     def rule_count(self):
         rule_count = 0
-        for group_name, rule_group in self.group.items():
+        for _group_name, rule_group in self.group.items():
             rule_count += rule_group.count
         return rule_count
 
+    @property
+    def oid_count(self):
+        oid_count = 0
+        for _group_name, rule_group in self.group.items():
+            for _rule_name, rule in rule_group.rules.items():
+                if rule.has_oid:
+                    oid_count += 1
+        return oid_count
+
     @property
     def group_names(self):
         return self.group.keys()
@@ -278,23 +346,23 @@ class RuleFile:
                     # skipped recording rule
                     pass
 
-    def error_report(self):
-        def max_width(item_list: List[str]) -> int:
-            return max([len(i) for i in item_list])
+    def report(self):
+        def max_width(item_list: Set[str], min_width: int = 0) -> int:
+            return max([len(i) for i in item_list] + [min_width])
 
         if not self.problems and not self.duplicate_alert_names:
-            print("\nNo problems detected in rule file")
+            print("\nNo problems detected in the rule file")
             return
 
         print("\nProblem Report\n")
 
-        group_width = max_width(self.problems)
+        group_width = max_width(self.problems, 5)
         alert_names = set()
         for g in self.problems:
             group = self.group[g]
             alert_names.update(group.problems.get('error', []))
             alert_names.update(group.problems.get('warning', []))
-        alert_width = max_width(alert_names)
+        alert_width = max_width(alert_names, 10)
 
         template = "  {group:<{group_width}}  {severity:<8}  {alert_name:<{alert_width}}  {description}"
 
@@ -382,7 +450,7 @@ class UnitTests:
     def process(self, defined_alert_names: List[str]):
         self._check_alert_names(defined_alert_names)
 
-    def error_report(self) -> None:
+    def report(self) -> None:
 
         if not self.problems:
             print("\nNo problems detected in unit tests file")
@@ -404,6 +472,26 @@ class RuleChecker:
         self.warnings = {}
         self.error_count = 0
         self.warning_count = 0
+        self.oid_count = 0
+
+        self.oid_list = self.build_oid_list()
+
+    def build_oid_list(self) -> List[str]:
+
+        cmd = shutil.which('snmptranslate')
+        if not cmd:
+            return []
+
+        rc, stdout, stderr = run_command(f"{cmd} -Pu -Tz -M ../../snmp:/usr/share/snmp/mibs -m CEPH-MIB")
+        if rc != 0:
+            return []
+
+        oid_list: List[str] = []
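+        # snmptranslate -Tz emits '"<label>"<tabs>"<numeric oid>"' pairs; strip the quotes and keep just the OID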
+        for line in stdout[:-1]:
+            _label, oid = line.replace('"', '').replace('\t', ' ').split()
+            oid_list.append(oid)
+
+        return oid_list
 
     @property
     def status(self):
@@ -448,36 +536,34 @@ class RuleChecker:
             print(errs)
             sys.exit(12)
 
-        self.rule_file = RuleFile(self, self.rules_filename, rules)
+        self.rule_file = RuleFile(self, self.rules_filename, rules, self.oid_list)
         self.summarise_rule_file()
 
         self.unit_tests = UnitTests(self.test_filename)
         self.unit_tests.process(self.rule_file.alert_names_seen)
 
-    def error_report(self):
+    def report(self):
         print("\n\nSummary\n")
         print(f"Rule file             : {self.rules_filename}")
         print(f"Unit Test file        : {self.test_filename}")
         print(f"\nRule groups processed : {self.rule_file.group_count:>3}")
         print(f"Rules processed       : {self.rule_file.rule_count:>3}")
+        print(f"SNMP OIDs declared    : {self.rule_file.oid_count:>3} {'(snmptranslate missing, unable to cross check)' if not self.oid_list else ''}")
         print(f"Rule errors           : {self.error_count:>3}")
         print(f"Rule warnings         : {self.warning_count:>3}")
         print(f"Rule name duplicates  : {len(self.rule_file.duplicate_alert_names):>3}")
         print(f"Unit tests missing    : {len(self.unit_tests.problems):>3}")
 
-        if self.rule_file_problems:
-            self.rule_file.error_report()
-        if self.unit_tests.problems:
-            self.unit_tests.error_report()
+        self.rule_file.report()
+        self.unit_tests.report()
 
 
 def main():
     checker = RuleChecker()
 
     checker.run()
-    if checker.status > 0:
-        checker.error_report()
-        print()
+    checker.report()
+    print()
 
     sys.exit(checker.status)
 
diff --git a/monitoring/snmp/CEPH-MIB.txt b/monitoring/snmp/CEPH-MIB.txt
new file mode 100644 (file)
index 0000000..f54cb36
--- /dev/null
@@ -0,0 +1,337 @@
+CEPH-MIB DEFINITIONS ::= BEGIN
+
+IMPORTS
+    MODULE-IDENTITY, NOTIFICATION-TYPE, enterprises
+        FROM SNMPv2-SMI
+    MODULE-COMPLIANCE, NOTIFICATION-GROUP
+        FROM SNMPv2-CONF
+;
+
+-- Linting information:
+--
+-- # smilint -l 6 -i notification-not-reversible ./CEPH-MIB.txt
+--
+-- ignore: notification-not-reversible since our SNMP gateway doesn't use SNMPv1
+--
+
+ceph MODULE-IDENTITY
+    LAST-UPDATED
+        "202111010000Z" -- Nov 01, 2021
+    ORGANIZATION
+        "The Ceph Project
+         https://ceph.io"
+    CONTACT-INFO
+        "Email: <dev@ceph.io>
+
+        Send comments to: <dev@ceph.io>"
+    DESCRIPTION
+        "The MIB module for Ceph. In it's current form it only
+        supports Notifications, since Ceph itself doesn't provide
+        any SNMP agent functionality.
+
+        Notifications are provided through a Prometheus/Alertmanager
+        webhook passing alerts to an external gateway service that is
+        responsible for formatting, forwarding and authenticating to
+        the SNMP receiver.
+        "
+    REVISION
+        "202111010000Z" --Nov 01, 2021
+    DESCRIPTION
+        "Latest version including the following updates;
+
+        - MIB restructure to align with linting
+        - names shortened and simplified (less verbose)
+        - Simplified structure due to switch to https://github.com/maxwo/snmp_notifier
+          - objects removed
+          - notifications updated
+        - Added module compliance
+        - Updated to latest prometheus alert rule definitions
+        "
+    ::= { enterprises 50495 }
+
+cephCluster       OBJECT IDENTIFIER ::= { ceph 1 }
+cephConformance   OBJECT IDENTIFIER ::= { ceph 2 }
+
+-- cephMetadata is a placeholder for possible future expansion via an agent
+-- where we could provide an overview of the clusters configuration
+cephMetadata      OBJECT IDENTIFIER ::= { cephCluster 1 }
+cephNotifications OBJECT IDENTIFIER ::= { cephCluster 2 }
+
+prometheus OBJECT IDENTIFIER ::= { cephNotifications 1 }
+
+--
+-- Notifications: first we define the notification 'branches' for the
+-- different categories of notifications / alerts
+promGeneric       OBJECT IDENTIFIER ::= { prometheus 1 }
+promHealthStatus  OBJECT IDENTIFIER ::= { prometheus 2 }
+promMon           OBJECT IDENTIFIER ::= { prometheus 3 }
+promOsd           OBJECT IDENTIFIER ::= { prometheus 4 }
+promMds           OBJECT IDENTIFIER ::= { prometheus 5 }
+promMgr           OBJECT IDENTIFIER ::= { prometheus 6 }
+promPGs           OBJECT IDENTIFIER ::= { prometheus 7 }
+promNode          OBJECT IDENTIFIER ::= { prometheus 8 }
+promPool          OBJECT IDENTIFIER ::= { prometheus 9 }
+promRados         OBJECT IDENTIFIER ::= { prometheus 10 }
+promCephadm       OBJECT IDENTIFIER ::= { prometheus 11 }
+promPrometheus    OBJECT IDENTIFIER ::= { prometheus 12 }
+
+promGenericNotification NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Generic alert issued when the Prometheus rule doesn't provide an OID."
+::= { promGeneric 1 }
+
+promGenericDaemonCrash NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more daemons have crashed recently, and are yet to be archived"
+::= { promGeneric 2 }
+
+promHealthStatusError NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Ceph in health_error state for too long."
+::= { promHealthStatus 1 }
+
+promHealthStatusWarning NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Ceph in health_warn for too long."
+::= { promHealthStatus 2 }
+
+promMonLowQuorum NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Monitor count in quorum is low."
+::= { promMon 1 }
+
+promMonDiskSpaceCritical NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Monitor diskspace is critically low."
+::= { promMon 2 }
+
+promOsdDownHigh NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A high number of OSDs are down."
+::= { promOsd 1 }
+
+promOsdDown NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more Osds down."
+::= { promOsd 2 }
+
+promOsdNearFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "An OSD is dangerously full."
+::= { promOsd 3 }
+
+promOsdFlapping NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "An OSD was marked down at back up at least once a minute for 5 minutes."
+::= { promOsd 4 }
+
+promOsdHighPgDeviation NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "An OSD deviates by more then 30% from average PG count."
+::= { promOsd 5 }
+
+promOsdFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "An OSD has reached its full threshold."
+::= { promOsd 6 }
+
+promOsdHighPredictedFailures NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Normal self healing unable to cope with the number of devices predicted to fail."
+::= { promOsd 7 }
+
+promOsdHostDown NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Ceph OSD host is down."
+::= { promOsd 8 }
+
+promMdsDamaged NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephfs filesystem is damaged."
+::= { promMds 1 }
+
+promMdsReadOnly NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephfs filesystem marked as READ-ONLY"
+::= { promMds 2 }
+
+promMdsOffline NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephfs filesystem is unavailable/offline."
+::= { promMds 3 }
+
+promMdsDegraded NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephfs filesystem is in a degraded state."
+::= { promMds 4 }
+
+promMdsNoStandby NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephfs MDS daemon failure, no standby available"
+::= { promMds 5 }
+
+promMgrModuleCrash NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Ceph mgr module has crashed recently"
+::= { promMgr 1 }
+
+promMgrPrometheusInactive NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Ceph mgr prometheus module not responding"
+::= { promMgr 2 }
+
+promPGsInactive NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more PGs are inactive for more than 5 minutes."
+::= { promPGs 1 }
+
+promPGsUnclean NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more PGs are not clean for more than 15 minutes."
+::= { promPGs 2 }
+
+promPGsUnavailable NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more PGs is unavailable, blocking I/O to those objects."
+::= { promPGs 3 }
+
+promPGsDamaged NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more PGs is damaged."
+::= { promPGs 4 }
+
+promPGsRecoveryFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "PG recovery is impaired due to full OSDs."
+::= { promPGs 5 }
+
+promPGsBackfillFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "PG backfill is impaired due to full OSDs."
+::= { promPGs 6 }
+
+promNodeRootVolumeFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Root volume (OSD and MON store) is dangerously full (< 5% free)."
+::= { promNode 1 }
+
+promNodeNetworkPacketDrops NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A node experiences packet drop > 1 packet/s on an interface."
+::= { promNode 2 }
+
+promNodeNetworkPacketErrors NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A node experiences packet errors > 1 packet/s on an interface."
+::= { promNode 3 }
+
+promNodeStorageFilling NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A mountpoint will be full in less then 5 days assuming the average fillup rate of the past 48 hours."
+::= { promNode 4 }
+
+promPoolFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A pool is at 90% capacity or over."
+::= { promPool 1 }
+
+promPoolFilling NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A pool will be full in less then 5 days assuming the average fillup rate of the past 48 hours."
+::= { promPool 2 }
+
+promRadosUnfound NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A RADOS object can not be found, even though all OSDs are online."
+::= { promRados 1 }
+
+promCephadmDaemonDown NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephadm has determined that a daemon is down."
+::= { promCephadm 1 }
+
+promCephadmUpgradeFailure NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephadm attempted to upgrade the cluster and encountered a problem."
+::= { promCephadm 2 }
+
+promPrometheusJobMissing NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "The prometheus scrape job is not defined."
+::= { promPrometheus 1 }
+-- ---------------------------------------------------------- --
+-- Ceph MIB - Conformance Information
+-- ---------------------------------------------------------- --
+
+cephAlertGroups   OBJECT IDENTIFIER ::= { cephConformance 1 }
+cephCompliances   OBJECT IDENTIFIER ::= { cephConformance 2 }
+
+-- ---------------------------------------------------------- --
+-- units of conformance
+-- ---------------------------------------------------------- --
+
+-- ---------------------------------------------------------- --
+-- The Trap Notification Group
+-- ---------------------------------------------------------- --
+
+cephNotificationGroup NOTIFICATION-GROUP
+    NOTIFICATIONS {
+        promGenericNotification,
+        promGenericDaemonCrash,
+        promHealthStatusError,
+        promHealthStatusWarning,
+        promMonLowQuorum,
+        promMonDiskSpaceCritical,
+        promOsdDownHigh,
+        promOsdDown,
+        promOsdNearFull,
+        promOsdFlapping,
+        promOsdHighPgDeviation,
+        promOsdFull,
+        promOsdHighPredictedFailures,
+        promOsdHostDown,
+        promMdsDamaged,
+        promMdsReadOnly,
+        promMdsOffline,
+        promMdsDegraded,
+        promMdsNoStandby,
+        promMgrModuleCrash,
+        promMgrPrometheusInactive,
+        promPGsInactive,
+        promPGsUnclean,
+        promPGsUnavailable,
+        promPGsDamaged,
+        promPGsRecoveryFull,
+        promPGsBackfillFull,
+        promNodeRootVolumeFull,
+        promNodeNetworkPacketDrops,
+        promNodeNetworkPacketErrors,
+        promNodeStorageFilling,
+        promPoolFull,
+        promPoolFilling,
+        promRadosUnfound,
+        promCephadmDaemonDown,
+        promCephadmUpgradeFailure,
+        promPrometheusJobMissing
+    }
+    STATUS current
+    DESCRIPTION
+        "A collection of notifications triggered by the Prometheus
+        rules to convey Ceph cluster state"
+    ::= { cephAlertGroups 2 }
+
+-- ---------------------------------------------------------- --
+-- compliance statements
+-- ---------------------------------------------------------- --
+
+cephCompliance MODULE-COMPLIANCE
+    STATUS current
+    DESCRIPTION
+        "The Compliance statement for the Ceph MIB"
+    MODULE
+        MANDATORY-GROUPS {
+            cephNotificationGroup
+        }
+    ::= { cephCompliances 1 }
+
+END
index dccef1908f89dd1189e1af747700115317db2f12..1a5b609556df42a9ae0aaf42a0e1355a5c3cc3b8 100644 (file)
@@ -1,24 +1,54 @@
 # SNMP schema
+To show the [OIDs](https://en.wikipedia.org/wiki/Object_identifier) supported by the MIB, use the `snmptranslate` command. For example:
+```
+snmptranslate -Pu -Tz -M ~/git/ceph/monitoring/snmp:/usr/share/snmp/mibs -m CEPH-MIB
+```
+*The `snmptranslate` command is provided by the `net-snmp-utils` package.*
 
-## Traps
+The MIB provides a NOTIFICATION-only implementation, since Ceph itself doesn't provide
+an SNMP agent.
 
-| OID | Description |
-| :--- | :--- |
-| 1.3.6.1.4.1.50495.15.1.2.1 | The default trap. This is used if no OID is specified in the alert labels. |
-| 1.3.6.1.4.1.50495.15.1.2.[2...N] | Custom traps. |
+## Integration
+The SNMP MIB has been aligned with the Prometheus rules: any rule that defines a
+critical alert should have a corresponding OID in the CEPH-MIB.txt file. To generate
+an SNMP notification, you need an SNMP gateway that the Prometheus Alertmanager
+service can forward alerts to via its webhook feature.
 
-## Objects
+&nbsp;
 
-The following objects are appended as variable binds to an SNMP trap.
+## SNMP Gateway
+The recommended SNMP gateway is https://github.com/maxwo/snmp_notifier. This is a widely
+used, generic SNMP gateway implementation written in Go. Its usage (syntax and
+parameters) is very similar to that of Prometheus, Alertmanager and node-exporter.
 
-| OID | Type | Description |
-| :--- | :---: | :--- |
-| 1.3.6.1.4.1.50495.15.1.1.1 | String | The name of the Prometheus alert. |
-| 1.3.6.1.4.1.50495.15.1.1.2 | String | The status of the Prometheus alert. |
-| 1.3.6.1.4.1.50495.15.1.1.3 | String | The severity of the Prometheus alert. |
-| 1.3.6.1.4.1.50495.15.1.1.4 | String | Unique identifier for the Prometheus instance. |
-| 1.3.6.1.4.1.50495.15.1.1.5 | String | The name of the Prometheus job. |
-| 1.3.6.1.4.1.50495.15.1.1.6 | String | The Prometheus alert description field. |
-| 1.3.6.1.4.1.50495.15.1.1.7 | String | Additional Prometheus alert labels as JSON string. |
-| 1.3.6.1.4.1.50495.15.1.1.8 | Unix timestamp | The time when the Prometheus alert occurred. |
-| 1.3.6.1.4.1.50495.15.1.1.9 | String | The raw Prometheus alert as JSON string. |
\ No newline at end of file
+&nbsp;
+## SNMP OIDs
+The main components of the Ceph MIB can be broken down into discrete areas:
+
+
+```
+internet private enterprise   ceph   ceph    Notifications   Prometheus  Notification
+                               org  cluster   (alerts)         source      Category
+1.3.6.1   .4     .1          .50495   .1        .2               .1         .2  (Ceph Health)
+                                                                            .3  (MON)
+                                                                            .4  (OSD)
+                                                                            .5  (MDS)
+                                                                            .6  (MGR)
+                                                                            .7  (PGs)
+                                                                            .8  (Nodes)
+                                                                            .9  (Pools)
+                                                                            .10  (Rados)
+                                                                            .11 (cephadm)
+                                                                            .12 (prometheus)
+
+```
+Individual alerts are placed within the appropriate alert category. For example, to add
+a notification relating to an MGR issue, you would use an OID of the form 1.3.6.1.4.1.50495.1.2.1.6.x
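+
+As a rough sketch (not part of the tooling in this repository), the notification category
+of a rule OID can be derived from the index that follows the Prometheus source arc. The
+category names below mirror the branches defined in CEPH-MIB.txt; the helper name is
+purely illustrative:
+
+```
+# Illustrative only: map a rule OID to its notification category, assuming the
+# 1.3.6.1.4.1.50495.1.2.1.<category>.<alert> layout described above.
+CATEGORIES = {
+    1: 'Generic', 2: 'Ceph Health', 3: 'MON', 4: 'OSD', 5: 'MDS', 6: 'MGR',
+    7: 'PGs', 8: 'Nodes', 9: 'Pools', 10: 'Rados', 11: 'cephadm', 12: 'prometheus',
+}
+
+def oid_category(oid: str) -> str:
+    prefix = '1.3.6.1.4.1.50495.1.2.1.'
+    if not oid.startswith(prefix):
+        return 'unknown'
+    category = int(oid[len(prefix):].split('.')[0])
+    return CATEGORIES.get(category, 'unknown')
+
+print(oid_category('1.3.6.1.4.1.50495.1.2.1.6.2'))   # -> MGR
+```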
+
+The SNMP gateway also adds the following components to the SNMP notification:
+
+| Suffix | Description |
+|--------|-------------|
+| .1 | The OID |
+| .2 | Severity of the alert. When an alert is resolved, severity is 'info', and the description is set to Status:OK |
+| .3 | Text of the alert(s) |