mgr/prometheus: track individual healthchecks as metrics
author    Paul Cuzner <pcuzner@redhat.com>
Thu, 16 Sep 2021 23:24:29 +0000 (11:24 +1200)
committer Avan Thakkar <athakkar@redhat.com>
Thu, 6 Jan 2022 09:47:47 +0000 (15:17 +0530)
This patch creates a health history object maintained in
the module's KV store. The history and the current health
checks are used to create a metric per health check, while
also providing a history feature. Two new commands are added:
ceph healthcheck history ls
ceph healthcheck history clear

In addition to the new commands, the additional metrics
have been used to update the Prometheus alerts.

Fixes: https://tracker.ceph.com/issues/52638
Signed-off-by: Paul Cuzner <pcuzner@redhat.com>
(cherry picked from commit e0dfc02063ef40cf6a1dc6e3080d0a856ceff050)

 Conflicts:
doc/mgr/prometheus.rst
     - Adapted the doc to align with master.

doc/mgr/prometheus.rst
monitoring/prometheus/alerts/ceph_default_alerts.yml
monitoring/prometheus/alerts/test_alerts.yml
src/pybind/mgr/prometheus/module.py

index 0f1caff2353ffba1fd4fe6f30b72084eb5b0c925..fc78795a3ea78ce0232715f8ed050ae2d2c07b4b 100644 (file)
@@ -45,9 +45,8 @@ is registered with Prometheus's `registry
 
     The ``scrape_interval`` of this module should always be set to match
     Prometheus' scrape interval to work properly and not cause any issues.
-    
-The Prometheus manager module is, by default, configured with a scrape interval
-of 15 seconds.  The scrape interval in the module is used for caching purposes
+
+The scrape interval in the module is used for caching purposes
 and to determine when a cache is stale.
 
 It is not recommended to use a scrape interval below 10 seconds.  It is
@@ -90,6 +89,43 @@ If you are confident that you don't require the cache, you can disable it::
 
 .. _prometheus-rbd-io-statistics:
 
+Ceph Health Checks
+------------------
+
+The mgr/prometheus module also tracks and maintains a history of Ceph health checks,
+exposing them to the Prometheus server as discrete metrics. This allows Prometheus
+alert rules to be configured for specific health check events.
+
+The metrics take the following form:
+
+::
+
+    # HELP ceph_health_detail healthcheck status by type (0=inactive, 1=active)
+    # TYPE ceph_health_detail gauge
+    ceph_health_detail{name="OSDMAP_FLAGS",severity="HEALTH_WARN"} 0.0
+    ceph_health_detail{name="OSD_DOWN",severity="HEALTH_WARN"} 1.0
+    ceph_health_detail{name="PG_DEGRADED",severity="HEALTH_WARN"} 1.0
+
+The health check history is made available through the following commands:
+
+::
+
+    healthcheck history ls [--format {plain|json|json-pretty}]
+    healthcheck history clear
+
+The ``ls`` command provides an overview of the health checks that the cluster has
+encountered, either over its lifetime or since the last ``clear`` command was issued.
+For example:
+
+::
+
+    [ceph: root@c8-node1 /]# ceph healthcheck history ls
+    Healthcheck Name          First Seen (UTC)      Last seen (UTC)       Count  Active
+    OSDMAP_FLAGS              2021/09/16 03:17:47   2021/09/16 22:07:40       2    No
+    OSD_DOWN                  2021/09/17 00:11:59   2021/09/17 00:11:59       1   Yes
+    PG_DEGRADED               2021/09/17 00:11:59   2021/09/17 00:11:59       1   Yes
+    3 health check(s) listed
+
+
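As an illustration of how these metrics can drive alerting (a minimal sketch, not text from the patch itself; the alert name and the 5-minute hold-off are arbitrary choices), a Prometheus rule keyed to a single health check might look like the following. The ceph_default_alerts.yml changes later in this commit contain the production versions of such rules.

    groups:
      - name: example-ceph-healthcheck-rules
        rules:
          - alert: CephOSDDownExample              # hypothetical alert name
            expr: ceph_health_detail{name="OSD_DOWN"} == 1
            for: 5m                                # arbitrary hold-off period
            labels:
              severity: warning
            annotations:
              description: "The OSD_DOWN health check has been active for more than 5 minutes."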
 RBD IO statistics
 -----------------
 
@@ -303,8 +339,8 @@ node_targets.yml
 Notes
 =====
 
-Counters and gauges are exported; currently histograms and long-running 
-averages are not.  It's possible that Ceph's 2-D histograms could be 
+Counters and gauges are exported; currently histograms and long-running
+averages are not.  It's possible that Ceph's 2-D histograms could be
 reduced to two separate 1-D histograms, and that long-running averages
 could be exported as Prometheus' Summary type.
 
index 71fc864cddf7db4db38327eb199cf22fcb13eb5c..420472d35eebfc6b87e6984fe6b4c35030031e86 100644 (file)
@@ -27,22 +27,86 @@ groups:
 
   - name: mon
     rules:
-      - alert: low monitor quorum count
-        expr: sum(ceph_mon_quorum_status) < 3
+      - alert: Monitor down, quorum is at risk
+        expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
+        for: 30s
         labels:
           severity: critical
           type: ceph_default
           oid: 1.3.6.1.4.1.50495.15.1.2.3.1
         annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-down
           description: |
-            Monitor count in quorum is below three.
+            {{ $min := query "floor(count(ceph_mon_metadata) / 2) +1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active
+            Without quorum the cluster will become inoperable, affecting all connected clients and services.
 
-            Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active.
+            The following monitors are down:
+            {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
+              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+            {{- end }}
+      - alert: Monitor down
+        expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
+        for: 30s
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-down
+          description: |
+            {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down.
+            Quorum is still intact, but the loss of further monitors will make your cluster inoperable.
 
             The following monitors are down:
             {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
               - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
             {{- end }}
+      - alert: Ceph mon disk space critically low
+        expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-disk-crit
+          description: |
+            The free space available to a monitor's store is critically low (<5% by default).
+            You should increase the space available to the monitor(s). The
+            default location for the store sits under /var/lib/ceph. Your monitor hosts are:
+            {{- range query "ceph_mon_metadata"}}
+              - {{ .Labels.hostname }}
+            {{- end }}
+
+      - alert: Ceph mon disk space running low
+        expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
+        for: 5m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-disk-low
+          description: |
+            The space available to a monitor's store is approaching full (>70% is the default).
+            You should increase the space available to the monitor store. The
+            default location for the store sits under /var/lib/ceph. Your monitor hosts are:
+            {{- range query "ceph_mon_metadata"}}
+              - {{ .Labels.hostname }}
+            {{- end }}
+
+      - alert: Clock skew detected across Ceph Monitor daemons
+        expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-clock-skew
+          description: |
+            Ceph monitors rely on a consistent time reference to maintain
+            quorum and cluster consistency. This event indicates that at least
+            one of your monitors is not synchronized correctly.
+
+            Review the cluster status with 'ceph -s'. This will show which monitors
+            are affected. Check the time sync status on each monitor host.
 
   - name: osd
     rules:
@@ -60,43 +124,163 @@ groups:
             {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
               - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
             {{- end }}
-
+      - alert: OSD Host is down
+        expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
+        for: 5m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: |
+            The following OSDs are down:
+            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
+            - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }}
+            {{- end }}
       - alert: OSD down
-        expr: count(ceph_osd_up == 0) > 0
-        for: 15m
+        expr: ceph_health_detail{name="OSD_DOWN"} == 1
+        for: 5m
         labels:
           severity: warning
           type: ceph_default
           oid: 1.3.6.1.4.1.50495.15.1.2.4.2
         annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-down
           description: |
-            {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }}
-            {{ $value }} OSD{{ $s }} down for more than 15 minutes.
-
-            {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down.
+            {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5 minutes.
 
             The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
               {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
-                - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
               {{- end }}
 
       - alert: OSDs near full
-        expr: |
-          (
-            ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1)
-            * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
-          ) * 100 > 90
+        expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
         for: 5m
         labels:
-          severity: critical
+          severity: warning
           type: ceph_default
           oid: 1.3.6.1.4.1.50495.15.1.2.4.3
         annotations:
-          description: >
-            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is
-            dangerously full: {{ $value | humanize }}%
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-nearfull
+          description: |
+            One or more OSDs have reached their NEARFULL threshold.
+
+            Use 'ceph health detail' to identify which OSDs have reached this threshold.
+            To resolve, either add capacity to the cluster, or delete unwanted data.
+      - alert: OSD Full
+        expr: ceph_health_detail{name="OSD_FULL"} > 0
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-full
+          description: |
+            An OSD has reached its full threshold. Writes from all pools that share the
+            affected OSD will be blocked.
+
+            To resolve, either add capacity to the cluster, or delete unwanted data.
+      - alert: OSD unable to perform rebalance
+        expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-backfillfull
+          description: |
+            An OSD has reached its BACKFILL FULL threshold. This will prevent rebalance operations
+            from completing for some pools. Check the current capacity utilisation with 'ceph df'.
 
-      - alert: flapping OSD
+            To resolve, either add capacity to the cluster, or delete unwanted data.
+      - alert: OSD too many read repairs
+        expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1
+        for: 30s
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-too-many-repairs
+          description: |
+            Reads from an OSD have used a secondary PG to return data to the client, indicating
+            a potential failing disk.
+      - alert: OSD heartbeats running slow (frontend)
+        expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: |
+            OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network
+            for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
+      - alert: OSD heartbeats running slow (backend)
+        expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: |
+            OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network
+            for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
+      - alert: OSD disk size mismatch
+        expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#bluestore-disk-size-mismatch
+          description: |
+            One or more OSDs have an internal inconsistency between the size of the physical device and its
+            metadata. This could lead to the OSD(s) crashing in the future. You should redeploy the affected OSDs.
+      - alert: Device failure predicted
+        expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#id2
+          description: |
+            The device health module has determined that one or more devices will fail
+            soon. To review the device states use 'ceph device ls'. To show a specific
+            device use 'ceph device info <dev id>'.
+
+            Mark the OSD as out (so data may migrate to other OSDs in the cluster). Once
+            the OSD is empty, remove and replace it.
+      - alert: Too many devices predicted to fail
+        expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#device-health-toomany
+          description: |
+            The device health module has determined that the number of devices predicted to
+            fail cannot be remediated automatically, since it would take too many OSDs out of
+            the cluster, impacting performance and potentially availability. You should add new
+            OSDs to the cluster to allow data to be relocated and avoid data integrity issues.
+      - alert: Device failure predicted, but automatic drain is incomplete
+        expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#device-health-in-use
+          description: |
+            The device health module has determined that one or more devices will fail
+            soon, but the normal process of relocating the data on the device to other
+            OSDs in the cluster is blocked.
+
+            Check that the cluster has available free space. It may be necessary to add
+            more disks to the cluster to allow the data from the failing device to
+            successfully migrate.
+
+      - alert: Flapping OSD
         expr: |
           (
             rate(ceph_osd_up[5m])
@@ -107,11 +291,25 @@ groups:
           type: ceph_default
           oid: 1.3.6.1.4.1.50495.15.1.2.4.4
         annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd/#flapping-osds
           description: >
             OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
             marked down and back up at {{ $value | humanize }} times once a
-            minute for 5 minutes.
+            minute for 5 minutes. This could indicate a network issue (latency,
+            packet drop, disruption) on the cluster's "cluster network". Check the
+            network environment on the listed host(s).
 
+      - alert: OSD Read errors
+        expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
+        for: 30s
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#bluestore-spurious-read-errors
+          description: >
+            An OSD has encountered read errors, but the OSD has recovered by retrying
+            the reads. This may indicate an issue with the hardware or kernel.
       # alert on high deviation from average PG count
       - alert: high pg count deviation
         expr: |
@@ -130,12 +328,69 @@ groups:
             OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
             by more than 30% from average PG count.
       # alert on high commit latency...but how high is too high
+
   - name: mds
     rules:
-    # no mds metrics are exported yet
+      - alert: Ceph Filesystem damage detected
+        expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#cephfs-health-messages
+          description: >
+            The filesystem's metadata has been corrupted. Data access
+            may be blocked.
+
+            Either analyse the output from the mds daemon admin socket, or
+            escalate to support.
+      - alert: Ceph Filesystem switched to READ ONLY
+        expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#cephfs-health-messages
+          description: >
+            The filesystem has switched to READ ONLY due to an unexpected
+            write error when writing to the metadata pool.
+
+            Either analyse the output from the mds daemon admin socket, or
+            escalate to support.
+
   - name: mgr
     rules:
-    # no mgr metrics are exported yet
+      - alert: mgr module failure
+        expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
+        for: 5m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-mgr-module-crash
+          description: >
+            One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A
+            crashed module may impact functionality within the cluster. Use the 'ceph crash' commands to
+            investigate which module has failed, and archive it to acknowledge the failure.
+      - alert: mgr prometheus module is not active
+        expr: up{job="ceph"} == 0
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          description: >
+            The mgr/prometheus module at {{ $labels.instance }} is unreachable. This
+            could mean that the module has been disabled or the mgr itself is down.
+
+            Without the mgr/prometheus module, metrics and alerts will no longer
+            function. Open a shell to ceph and use 'ceph -s' to determine whether the
+            mgr is active. If the mgr is not active, restart it. Otherwise, check that
+            the mgr/prometheus module is loaded with 'ceph mgr module ls' and, if it is
+            not listed as enabled, enable it with 'ceph mgr module enable prometheus'
+
   - name: pgs
     rules:
       - alert: pgs inactive
@@ -160,8 +415,89 @@ groups:
         annotations:
           description: >
             {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
-            Unclean PGs haven't been able to completely recover from a
-            previous failure.
+            Unclean PGs haven't been able to completely recover from a previous failure.
+      - alert: Placement Group (PG) damaged
+        expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
+        for: 5m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-damaged
+          description: >
+            During data consistency checks (scrub), at least one PG has been flagged as being
+            damaged or inconsistent.
+
+            Check to see which PG is affected, and attempt a manual repair if necessary. To list
+            problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use
+            the 'ceph pg repair <pg_num>' command.
+      - alert: Recovery at risk, cluster too full
+        expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-recovery-full
+          description: >
+            Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their
+            'full' threshold. Add more capacity to the cluster, or delete unwanted data.
+      - alert: I/O blocked to some data
+        # PG_AVAILABILITY, but an OSD is not in a DOWN state
+        expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-availability
+          description: >
+            Data availability is reduced, impacting the cluster's ability to service I/O to some data. One or
+            more placement groups (PGs) are in a state that blocks I/O.
+      - alert: Cluster too full, automatic data recovery impaired
+        expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-backfill-full
+          description: >
+            Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs
+            have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data.
+      - alert: Placement Group(s) have not been scrubbed
+        expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
+        for: 5m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-not-scrubbed
+          description: |
+            One or more PGs have not been scrubbed recently. The scrub process is a data integrity
+            feature, protecting against bit-rot. It checks that objects and their metadata (size and
+            attributes) match across object replicas. When PGs miss their scrub window, it may
+            indicate the scrub window is too small, or PGs were not in a 'clean' state during the
+            scrub window.
+
+            You can manually initiate a scrub with: ceph pg scrub <pgid>
+      - alert: Placement Group(s) have not been 'DEEP' scrubbed
+        expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
+        for: 5m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-not-deep-scrubbed
+          description: |
+            One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity
+            feature, protecting against bit-rot. It compares the contents of objects and their
+            replicas for inconsistency. When PGs miss their deep scrub window, it may indicate
+            that the window is too small or PGs were not in a 'clean' state during the deep-scrub
+            window.
+
+            You can manually initiate a deep scrub with: ceph pg deep-scrub <pgid>
+
   - name: nodes
     rules:
       - alert: root volume full
@@ -218,9 +554,11 @@ groups:
             Node {{ $labels.instance }} experiences packet errors > 0.01% or
             > 10 packets/s on interface {{ $labels.device }}.
 
+      # Restrict to device names beginning with '/' to skip false alarms from
+      # tmpfs, overlay type filesystems
       - alert: storage filling up
         expr: |
-          predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) *
+          predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
           on(instance) group_left(nodename) node_uname_info < 0
         labels:
           severity: warning
@@ -256,7 +594,7 @@ groups:
         annotations:
           description: Pool {{ $labels.name }} at {{ $value | humanize }}% capacity.
 
-      - alert: pool filling up
+      - alert: pool filling up (growth forecast)
         expr: |
           (
             predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5)
@@ -271,6 +609,51 @@ groups:
             Pool {{ $labels.name }} will be full in less than 5 days
             assuming the average fill-up rate of the past 48 hours.
 
+      - alert: Ceph pool is too full for recovery/rebalance
+        expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: >
+            A pool is approaching its near-full threshold, which will
+            prevent rebalance operations from completing. You should
+            consider adding more capacity to the pool.
+
+      - alert: Ceph pool is full - writes blocked
+        expr: ceph_health_detail{name="POOL_FULL"} > 0
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pool-full
+          description: |
+            A pool has reached its MAX quota, or the OSDs supporting the pool
+            have reached their FULL threshold. Until this is resolved, writes to
+            the pool will be blocked.
+
+            Determine the affected pool with 'ceph df detail', for example looking
+            at QUOTA BYTES and STORED. Either increase the pool's quota, or add
+            capacity to the cluster first and then increase its quota
+            (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
+      - alert: Ceph pool is approaching full
+        expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0
+        for: 5m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: |
+            A pool has exceeded its warning (percent full) threshold, or the OSDs
+            supporting the pool have reached their NEARFULL thresholds. Writes may
+            continue, but you are at risk of the pool going read-only if more capacity
+            isn't made available.
+
+            Determine the affected pool with 'ceph df detail', for example looking
+            at QUOTA BYTES and STORED. Either increase the pool's quota, or add
+            capacity to the cluster first and then increase its quota
+            (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
   - name: healthchecks
     rules:
       - alert: Slow OSD Ops
@@ -280,5 +663,76 @@ groups:
           severity: warning
           type: ceph_default
         annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#slow-ops
           description: >
             {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)
+# cephadm alerts
+  - name: cephadm
+    rules:
+      - alert: Cluster upgrade has failed
+        expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
+        for: 30s
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          description: >
+            The cephadm cluster upgrade process has failed. The cluster remains in
+            an undetermined state.
+
+            Please review the cephadm logs to understand the nature of the issue.
+      - alert: A daemon managed by cephadm is down
+        expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
+        for: 30s
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          description: >
+            A daemon managed by cephadm is no longer active. Determine which
+            daemon is down with 'ceph health detail'. You may start daemons with
+            the 'ceph orch daemon start <daemon_id>' command.
+      - alert: cephadm management has been paused
+        expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/cephadm/operations/#cephadm-paused
+          description: >
+            Cluster management has been paused manually. This will prevent the
+            orchestrator from performing service management and reconciliation. If this
+            is not intentional, resume cephadm operations with 'ceph orch resume'
+
+# prometheus alerts
+  - name: prometheus
+    rules:
+      - alert: Scrape job is missing
+        expr: absent(up{job="ceph"})
+        for: 30s
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          description: |
+            The prometheus job that scrapes from Ceph is no longer defined. This
+            effectively means you will have no metrics or alerts for the cluster.
+
+            Please review the job definitions in the prometheus.yml file of the prometheus
+            instance.
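Both the "mgr prometheus module is not active" and "Scrape job is missing" alerts assume a Prometheus scrape job named "ceph". A minimal sketch of that job is shown below; the hostname ceph-mgr.example.com is hypothetical, while 9283 is the module's default port.

    global:
      scrape_interval: 15s            # keep in line with the module's own scrape_interval setting

    scrape_configs:
      - job_name: 'ceph'              # the job label that the alert expressions above rely on
        honor_labels: true
        static_configs:
          - targets: ['ceph-mgr.example.com:9283']   # hypothetical mgr host; 9283 is the default module port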
+# Object related events
+  - name: rados
+    rules:
+      - alert: Data not found/missing
+        expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
+        for: 30s
+        labels:
+          severity: critical
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#object-unfound
+          description: |
+            A version of a RADOS object cannot be found, even though all OSDs are up. I/O
+            requests for this object from clients will block (hang). Resolving this issue may
+            require the object to be rolled back to a prior version manually, and then verified.
\ No newline at end of file
index 913c207339b1c7d73d250efd61617dae096310ca..1e855c0902b03cd759fc2027a19dc4f8ce22b275 100644 (file)
@@ -59,54 +59,54 @@ tests:
             Please check "ceph health detail" for more information.
 
  # low monitor quorum count
- - interval: 1m
-   input_series:
-    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a",instance="ceph:9283",
-      job="ceph"}'
-      values: '1 1 1 1 1'
-    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b",instance="ceph:9283",
-      job="ceph"}'
-      values: '1 1 1 1 1'
-    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c",instance="ceph:9283",
-      job="ceph"}'
-      values: '0 0 0 0 0'
-    - series: 'ceph_mon_metadata{ceph_daemon="mon.a",ceph_version="ceph version
-      17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
-      (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
-      public_addr="172.20.0.2",rank="0"}'
-      values: '1 1 1 1 1'
-    - series: 'ceph_mon_metadata{ceph_daemon="mon.b",ceph_version="ceph version
-      17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
-      (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
-      public_addr="172.20.0.2",rank="1"}'
-      values: '1 1 1 1 1'
-    - series: 'ceph_mon_metadata{ceph_daemon="mon.c",ceph_version="ceph version
-      17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
-      (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
-      public_addr="172.20.0.2",rank="2"}'
-      values: '1 1 1 1 1'
-   promql_expr_test:
-     - expr: sum(ceph_mon_quorum_status) < 3
-       eval_time: 1m
-       exp_samples:
-         - labels: '{}'
-           value: 2
-   alert_rule_test:
-    - eval_time: 1m
-      alertname: low monitor quorum count
-      exp_alerts:
-      - exp_labels:
-          oid: 1.3.6.1.4.1.50495.15.1.2.3.1
-          type: ceph_default
-          severity: critical
-        exp_annotations:
-          description: |
-            Monitor count in quorum is below three.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a",instance="ceph:9283",
+      job="ceph"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b",instance="ceph:9283",
+      job="ceph"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c",instance="ceph:9283",
+      job="ceph"}'
+      values: '0 0 0 0 0'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.a",ceph_version="ceph version
+      17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
+      (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
+      public_addr="172.20.0.2",rank="0"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.b",ceph_version="ceph version
+      17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
+      (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
+      public_addr="172.20.0.2",rank="1"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.c",ceph_version="ceph version
+      17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
+      (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
+      public_addr="172.20.0.2",rank="2"}'
+      values: '1 1 1 1 1'
+   promql_expr_test:
+     - expr: sum(ceph_mon_quorum_status) < 3
+       eval_time: 1m
+       exp_samples:
+         - labels: '{}'
+           value: 2
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: low monitor quorum count
+      exp_alerts:
+      - exp_labels:
+          oid: 1.3.6.1.4.1.50495.15.1.2.3.1
+          type: ceph_default
+          severity: critical
+        exp_annotations:
+          description: |
+            Monitor count in quorum is below three.
 
-            Only 2 of 3 monitors are active.
+            Only 2 of 3 monitors are active.
 
-            The following monitors are down:
-              - mon.c on ceph
+            The following monitors are down:
+              - mon.c on ceph
 
 
  # 10% OSDs down
@@ -161,141 +161,141 @@ tests:
                - osd.1 on ceph
 
  # OSD down
- - interval: 1m
-   input_series:
-    - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
-      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-    - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
-      values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
-    - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
-      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
-      ceph_version="ceph version 17.0.0-189-g3558fd72
-      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-      public_addr="172.20.0.2"}'
-      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
-      ceph_version="ceph version 17.0.0-189-g3558fd72
-      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-      public_addr="172.20.0.2"}'
-      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
-      ceph_version="ceph version 17.0.0-189-g3558fd72
-      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-      public_addr="172.20.0.2"}'
-      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
-   promql_expr_test:
-     - expr: count(ceph_osd_up == 0) > 0
-       eval_time: 1m
-       exp_samples:
-         - labels: '{}'
-           value: 1
-   alert_rule_test:
-     - eval_time: 15m
-       alertname: OSD down
-       exp_alerts:
-       - exp_labels:
-           oid: 1.3.6.1.4.1.50495.15.1.2.4.2
-           type: ceph_default
-           severity: warning
-         exp_annotations:
-           description: |
+ - interval: 1m
+   input_series:
+    - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
+      values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
+    - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
+      ceph_version="ceph version 17.0.0-189-g3558fd72
+      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+      public_addr="172.20.0.2"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
+      ceph_version="ceph version 17.0.0-189-g3558fd72
+      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+      public_addr="172.20.0.2"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
+      ceph_version="ceph version 17.0.0-189-g3558fd72
+      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+      public_addr="172.20.0.2"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: count(ceph_osd_up == 0) > 0
+       eval_time: 1m
+       exp_samples:
+         - labels: '{}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 15m
+       alertname: OSD down
+       exp_alerts:
+       - exp_labels:
+           oid: 1.3.6.1.4.1.50495.15.1.2.4.2
+           type: ceph_default
+           severity: warning
+         exp_annotations:
+           description: |
 
-             1 OSD down for more than 15 minutes.
-
-             1 of 3 OSDs are down.
-
-             The following OSD is down:
-                 - osd.1 on ceph
-
-  # OSDs near full
- - interval: 1m
-   input_series:
-    - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.0",instance="ceph:9283"
-      ,job="ceph"}'
-      values: '1076310016 1076310016 1076310016 1076310016 1076310016
-      1076310016'
-    - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.1",instance="ceph:9283"
-      ,job="ceph"}'
-      values: '1076310016 1076310016 1076310016 1076310016 1076310016
-      1076310016'
-    - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.2",instance="ceph:9283"
-      ,job="ceph"}'
-      values: '1076310016 1076310016 1076310016 1076310016 1076310016
-      100856561909.76'
-    - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.0",instance="ceph:9283"
-      ,job="ceph"}'
-      values: '108447916032 108447916032 108447916032 108447916032 108447916032
-      108447916032'
-    - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.1",instance="ceph:9283"
-      ,job="ceph"}'
-      values: '108447916032 108447916032 108447916032 108447916032 108447916032
-      108447916032'
-    - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.2",instance="ceph:9283"
-      ,job="ceph"}'
-      values: '108447916032 108447916032 108447916032 108447916032 108447916032
-      108447916032'
-    - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
-      values: '1 1 1 1 1 1'
-    - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
-      values: '1 1 1 1 1 1'
-    - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
-      values: '1 1 1 1 1 1'
-    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
-      ceph_version="ceph version 17.0.0-189-g3558fd72
-      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-      public_addr="172.20.0.2"}'
-      values: '1 1 1 1 1 1'
-    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
-      ceph_version="ceph version 17.0.0-189-g3558fd72
-      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-      public_addr="172.20.0.2"}'
-      values: '1 1 1 1 1 1'
-    - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
-      ceph_version="ceph version 17.0.0-189-g3558fd72
-      (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
-      cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
-      hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
-      public_addr="172.20.0.2"}'
-      values: '1 1 1 1 1 1'
-   promql_expr_test:
-     - expr: |
-         (
-           ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon)
-           ceph_osd_up == 1) * on(ceph_daemon) group_left(hostname)
-           ceph_osd_metadata
-         ) * 100 > 90
+#              1 OSD down for more than 15 minutes.
 
-       eval_time: 5m
-       exp_samples:
-         - labels: '{ceph_daemon="osd.2",hostname="ceph",instance="ceph:9283",
-           job="ceph"}'
-           value: 9.3E+01
-   alert_rule_test:
-     - eval_time: 10m
-       alertname: OSDs near full
-       exp_alerts:
-       - exp_labels:
-           ceph_daemon: osd.2
-           hostname: ceph
-           instance: ceph:9283
-           job: ceph
-           oid: 1.3.6.1.4.1.50495.15.1.2.4.3
-           type: ceph_default
-           severity: critical
-         exp_annotations:
-           description: >
-             OSD osd.2 on ceph is dangerously full: 93%
+#              1 of 3 OSDs are down.
+
+#              The following OSD is down:
+#                  - osd.1 on ceph
+
+# OSDs near full
+#  - interval: 1m
+#    input_series:
+#     - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.0",instance="ceph:9283"
+#       ,job="ceph"}'
+#       values: '1076310016 1076310016 1076310016 1076310016 1076310016
+#       1076310016'
+#     - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.1",instance="ceph:9283"
+#       ,job="ceph"}'
+#       values: '1076310016 1076310016 1076310016 1076310016 1076310016
+#       1076310016'
+#     - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.2",instance="ceph:9283"
+#       ,job="ceph"}'
+#       values: '1076310016 1076310016 1076310016 1076310016 1076310016
+#       100856561909.76'
+#     - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.0",instance="ceph:9283"
+#       ,job="ceph"}'
+#       values: '108447916032 108447916032 108447916032 108447916032 108447916032
+#       108447916032'
+#     - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.1",instance="ceph:9283"
+#       ,job="ceph"}'
+#       values: '108447916032 108447916032 108447916032 108447916032 108447916032
+#       108447916032'
+#     - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.2",instance="ceph:9283"
+#       ,job="ceph"}'
+#       values: '108447916032 108447916032 108447916032 108447916032 108447916032
+#       108447916032'
+#     - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
+#       values: '1 1 1 1 1 1'
+#     - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
+#       values: '1 1 1 1 1 1'
+#     - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
+#       values: '1 1 1 1 1 1'
+#     - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
+#       ceph_version="ceph version 17.0.0-189-g3558fd72
+#       (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+#       cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+#       hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+#       public_addr="172.20.0.2"}'
+#       values: '1 1 1 1 1 1'
+#     - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
+#       ceph_version="ceph version 17.0.0-189-g3558fd72
+#       (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+#       cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+#       hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+#       public_addr="172.20.0.2"}'
+#       values: '1 1 1 1 1 1'
+#     - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
+#       ceph_version="ceph version 17.0.0-189-g3558fd72
+#       (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+#       cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+#       hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+#       public_addr="172.20.0.2"}'
+#       values: '1 1 1 1 1 1'
+#    promql_expr_test:
+#      - expr: |
+#          (
+#            ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon)
+#            ceph_osd_up == 1) * on(ceph_daemon) group_left(hostname)
+#            ceph_osd_metadata
+#          ) * 100 > 90
+
+#        eval_time: 5m
+#        exp_samples:
+#          - labels: '{ceph_daemon="osd.2",hostname="ceph",instance="ceph:9283",
+#            job="ceph"}'
+#            value: 9.3E+01
+#    alert_rule_test:
+#      - eval_time: 10m
+#        alertname: OSDs near full
+#        exp_alerts:
+#        - exp_labels:
+#            ceph_daemon: osd.2
+#            hostname: ceph
+#            instance: ceph:9283
+#            job: ceph
+#            oid: 1.3.6.1.4.1.50495.15.1.2.4.3
+#            type: ceph_default
+#            severity: critical
+#          exp_annotations:
+#            description: >
+#              OSD osd.2 on ceph is dangerously full: 93%
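With the original expression-based test retired above, a sketch of a replacement promtool unit test for the reworked "OSDs near full" alert could look like the following. It follows the conventions of the other new tests in this file; note that the expected annotations must mirror the rule's annotation text exactly.

 # OSDs near full (sketch against the new ceph_health_detail based rule)
 - interval: 1m
   input_series:
    - series: 'ceph_health_detail{name="OSD_NEARFULL"}'
      values: '1 1 1 1 1 1 1 1 1 1 1'
   promql_expr_test:
     - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
       eval_time: 2m
       exp_samples:
         - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}'
           value: 1
   alert_rule_test:
    - eval_time: 10m
      alertname: OSDs near full
      exp_alerts:
      - exp_labels:
          name: OSD_NEARFULL
          oid: 1.3.6.1.4.1.50495.15.1.2.4.3
          severity: warning
          type: ceph_default
        exp_annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-nearfull
          description: |
            One or more OSDs have reached their NEARFULL threshold.

            Use 'ceph health detail' to identify which OSDs have reached this threshold.
            To resolve, either add capacity to the cluster, or delete unwanted data.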
 
  # flapping OSD
  - interval: 1s
@@ -340,7 +340,7 @@ tests:
            value: 1.2200000000000001E+01
    alert_rule_test:
      - eval_time: 5m
-       alertname: flapping OSD
+       alertname: Flapping OSD
        exp_alerts:
        - exp_labels:
            ceph_daemon: osd.0
@@ -351,10 +351,13 @@ tests:
            severity: warning
            type: ceph_default
          exp_annotations:
+           documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd/#flapping-osds
            description: >
               OSD osd.0 on ceph was
-              marked down and back up at 20.1 times once a
-              minute for 5 minutes.
+              marked down and back up at 20.1 times once a minute for 5 minutes.
+              This could indicate a network issue (latency, packet drop, disruption)
+              on the cluster's "cluster network". Check the network environment on the
+              listed host(s).
 
  # high pg count deviation
  - interval: 1m
@@ -694,7 +697,7 @@ tests:
       values: '0 0 0 0 0'
     - series: 'node_network_up{device="eth4",instance="node-exporter",
       job="node-exporter"}'
-      values: '1 1 1 1 1'  
+      values: '1 1 1 1 1'
    promql_expr_test:
      - expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left()
              (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
@@ -792,6 +795,1012 @@ tests:
            severity: warning
            type: ceph_default
          exp_annotations:
+           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#slow-ops
            description: >
              1 OSD requests are taking too long to process
              (osd_op_complaint_time exceeded)
+
+# CEPHADM orchestrator alert triggers
+ - interval: 30s
+   input_series:
+    - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}'
+      values: '1+0x40'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Cluster upgrade has failed
+    - eval_time: 5m
+      alertname: Cluster upgrade has failed
+      exp_alerts:
+      - exp_labels:
+          name: UPGRADE_EXCEPTION
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          description: >
+            The cephadm cluster upgrade process has failed. The cluster remains in
+            an undetermined state.
+
+            Please review the cephadm logs to understand the nature of the issue.
+ - interval: 30s
+   input_series:
+    - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}'
+      values: '1+0x40'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: A daemon managed by cephadm is down
+    - eval_time: 5m
+      alertname: A daemon managed by cephadm is down
+      exp_alerts:
+      - exp_labels:
+          name: CEPHADM_FAILED_DAEMON
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          description: >
+            A daemon managed by cephadm is no longer active. Determine which
+            daemon is down with 'ceph health detail'. You may start daemons with
+            the 'ceph orch daemon start <daemon_id>' command.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}'
+      values: '1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: cephadm management has been paused
+    - eval_time: 5m
+      alertname: cephadm management has been paused
+      exp_alerts:
+      - exp_labels:
+          name: CEPHADM_PAUSED
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/cephadm/operations/#cephadm-paused
+          description: >
+            Cluster management has been paused manually. This will prevent the
+            orchestrator from performing service management and reconciliation. If this
+            is not intentional, resume cephadm operations with 'ceph orch resume'
+# MDS
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="MDS_DAMAGE"}'
+      values: '1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Ceph Filesystem damage detected
+    - eval_time: 5m
+      alertname: Ceph Filesystem damage detected
+      exp_alerts:
+      - exp_labels:
+          name: MDS_DAMAGE
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#cephfs-health-messages
+          description: >
+            The filesystem's metadata has been corrupted. Data access
+            may be blocked.
+
+            Either analyse the output from the mds daemon admin socket, or
+            escalate to support.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}'
+      values: '1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Ceph Filesystem switched to READ ONLY
+    - eval_time: 5m
+      alertname: Ceph Filesystem switched to READ ONLY
+      exp_alerts:
+      - exp_labels:
+          name: MDS_HEALTH_READ_ONLY
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#cephfs-health-messages
+          description: >
+            The filesystem has switched to READ ONLY due to an unexpected
+            write error when writing to the metadata pool.
+
+            Either analyse the output from the mds daemon admin socket, or
+            escalate to support.
+# MGR
+ - interval: 1m
+   input_series:
+    - series: 'up{job="ceph", instance="ceph-mgr:9283"}'
+      values: '1+0x2 0+0x10'
+   promql_expr_test:
+     - expr: up{job="ceph"} == 0
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="up", job="ceph", instance="ceph-mgr:9283"}'
+           value: 0
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: mgr prometheus module is not active
+    - eval_time: 10m
+      alertname: mgr prometheus module is not active
+      exp_alerts:
+      - exp_labels:
+          instance: ceph-mgr:9283
+          job: ceph
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          description: >
+            The mgr/prometheus module at ceph-mgr:9283 is unreachable. This
+            could mean that the module has been disabled or the mgr itself is down.
+
+            Without the mgr/prometheus module, metrics and alerts will no longer
+            function. Open a shell to ceph and use 'ceph -s' to determine whether the
+            mgr is active. If the mgr is not active, restart it. Otherwise, check that
+            the mgr/prometheus module is loaded with 'ceph mgr module ls' and, if it is
+            not listed as enabled, enable it with 'ceph mgr module enable prometheus'
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}'
+      values: '0+0x2 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: mgr module failure
+    - eval_time: 15m
+      alertname: mgr module failure
+      exp_alerts:
+      - exp_labels:
+          name: RECENT_MGR_MODULE_CRASH
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-mgr-module-crash
+          description: >
+            One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A
+            crashed module may impact functionality within the cluster. Use the 'ceph crash' commands to
+            investigate which module has failed, and archive it to acknowledge the failure.
+# MON
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="MON_DISK_CRIT"}'
+      values: '0+0x2 1+0x10'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
+      values: '1+0x13'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Ceph mon disk space critically low
+    - eval_time: 10m
+      alertname: Ceph mon disk space critically low
+      exp_alerts:
+      - exp_labels:
+          name: "MON_DISK_CRIT"
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-disk-crit
+          description: |
+            The free space available to a monitor's store is critically low (<5% by default).
+            You should increase the space available to the monitor(s). The
+            default location for the store sits under /var/lib/ceph. Your monitor hosts are:
+              - ceph-mon-a
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="MON_DISK_LOW"}'
+      values: '0+0x2 1+0x10'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
+      values: '1+0x13'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Ceph mon disk space running low
+    - eval_time: 10m
+      alertname: Ceph mon disk space running low
+      exp_alerts:
+      - exp_labels:
+          name: "MON_DISK_LOW"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-disk-low
+          description: |
+            The space available to a monitor's store is approaching full (>70% is the default).
+            You should increase the space available to the monitor store. The
+            default location for the store sits under /var/lib/ceph. Your monitor hosts are:
+              - ceph-mon-a
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Clock skew detected across Ceph Monitor daemons
+    - eval_time: 10m
+      alertname: Clock skew detected across Ceph Monitor daemons
+      exp_alerts:
+      - exp_labels:
+          name: "MON_CLOCK_SKEW"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-clock-skew
+          description: |
+            The Ceph monitors rely on a consistent time reference to maintain
+            quorum and cluster consistency. This event indicates that at least
+            one of your mons is not synchronized correctly.
+
+            Review the cluster status with ceph -s. This will show which monitors
+            are affected. Check the time sync status on each monitor host.
+
+# Check 3 mons one down, quorum at risk
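+# (the alert expression only fires when MON_DOWN is active AND the number of mons
+# still in quorum equals the bare majority, floor(n/2)+1, so losing one more monitor
+# would break quorum)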
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="MON_DOWN"}'
+      values: '0+0x2 1+0x12'
+    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
+      values: '1+0x2 0+0x12'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
+      values: '1+0x14'
+   promql_expr_test:
+     - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Monitor down, quorum is at risk
+      # shouldn't fire
+    - eval_time: 10m
+      alertname: Monitor down, quorum is at risk
+      exp_alerts:
+      - exp_labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.3.1
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-down
+          description: |
+            Quorum requires a majority of monitors (x 2) to be active.
+            Without quorum the cluster will become inoperable, affecting all connected clients and services.
+
+            The following monitors are down:
+              - mon.c on ceph-mon-3
+# check 5 mons, 1 down - warning only
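+# (quorum still holds a clear majority here, so only the lower severity 'Monitor down'
+# alert is expected to fire)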
+ - interval: 1m
+   input_series:
+    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}'
+      values: '1+0x2 0+0x12'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}'
+      values: '1+0x14'
+    - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}'
+      values: '1+0x14'
+   promql_expr_test:
+     - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
+       eval_time: 3m
+       exp_samples:
+         - labels: '{}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Monitor down
+    - eval_time: 10m
+      alertname: Monitor down
+      exp_alerts:
+      - exp_labels:
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-down
+          description: |
+            You have 1 monitor down.
+            Quorum is still intact, but the loss of further monitors will make your cluster inoperable.
+
+            The following monitors are down:
+              - mon.e on ceph-mon-5
+# Device Health
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="DEVICE_HEALTH"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Device failure predicted
+    - eval_time: 10m
+      alertname: Device failure predicted
+      exp_alerts:
+      - exp_labels:
+          name: "DEVICE_HEALTH"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#id2
+          description: |
+            The device health module has determined that one or more devices will fail
+            soon. To review the device states use 'ceph device ls'. To show a specific
+            device use 'ceph device info <dev id>'.
+
+            Mark the OSD as out (so data may migrate to other OSDs in the cluster). Once
+            the OSD is empty, remove and replace it.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Too many devices predicted to fail
+    - eval_time: 10m
+      alertname: Too many devices predicted to fail
+      exp_alerts:
+      - exp_labels:
+          name: "DEVICE_HEALTH_TOOMANY"
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#device-health-toomany
+          description: |
+            The device health module has determined that the number of devices predicted to
+            fail cannot be remediated automatically, since it would take too many OSDs out of
+            the cluster, impacting performance and potentially availability. You should add new
+            OSDs to the cluster to allow data to be relocated and avoid data integrity issues.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Device failure predicted, but automatic drain is incomplete
+    - eval_time: 10m
+      alertname: Device failure predicted, but automatic drain is incomplete
+      exp_alerts:
+      - exp_labels:
+          name: "DEVICE_HEALTH_IN_USE"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#device-health-in-use
+          description: |
+            The device health module has determined that one or more devices will fail
+            soon, but the normal process of relocating the data on the device to other
+            OSDs in the cluster is blocked.
+
+            Check that the cluster has available free space. It may be necessary to add
+            more disks to the cluster to allow the data from the failing device to
+            successfully migrate.
+# OSD
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}'
+      values: '0+0x2 1+0x10'
+    - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
+      values: '1+0x2 0+0x10'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
+      values: '1+0x12'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: OSD Host is down
+    - eval_time: 10m
+      alertname: OSD Host is down
+      exp_alerts:
+      - exp_labels:
+          name: "OSD_HOST_DOWN"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          description: |
+            The following OSDs are down:
+            - ceph-osd-1 : osd.0
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}'
+      values: '0+0x2 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}'
+           value: 0
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: OSD hearbeats running slow (frontend)
+    - eval_time: 10m
+      alertname: OSD hearbeats running slow (frontend)
+      exp_alerts:
+      - exp_labels:
+          name: "OSD_SLOW_PING_TIME_FRONT"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          description: |
+            OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network
+            for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}'
+      values: '0+0x2 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}'
+           value: 0
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: OSD hearbeats running slow (backend)
+    - eval_time: 10m
+      alertname: OSD hearbeats running slow (backend)
+      exp_alerts:
+      - exp_labels:
+          name: "OSD_SLOW_PING_TIME_BACK"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          description: |
+            OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network
+            for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}'
+      values: '0+0x2 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}'
+           value: 0
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: OSD disk size mismatch
+    - eval_time: 10m
+      alertname: OSD disk size mismatch
+      exp_alerts:
+      - exp_labels:
+          name: "BLUESTORE_DISK_SIZE_MISMATCH"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#bluestore-disk-size-mismatch
+          description: |
+            One or more OSDs have an internal inconsistency between the size of the physical device and its metadata.
+            This could lead to the OSD(s) crashing in the future. You should redeploy the affected OSDs.
+ - interval: 30s
+   input_series:
+    - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
+      values: '0+0x2 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: OSD Read errors
+    - eval_time: 10m
+      alertname: OSD Read errors
+      exp_alerts:
+      - exp_labels:
+          name: "BLUESTORE_SPURIOUS_READ_ERRORS"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#bluestore-spurious-read-errors
+          description: >
+            An OSD has encountered read errors, but the OSD has recovered by retrying
+            the reads. This may indicate an issue with the hardware or kernel.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="OSD_DOWN"}'
+      values: '0+0x2 1+0x10'
+    - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
+      values: '1+0x12'
+    - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
+      values: '1+0x2 0+0x10'
+    - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
+      values: '1+0x12'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
+      values: '1+0x12'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}'
+      values: '1+0x12'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}'
+      values: '1+0x12'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="OSD_DOWN"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: OSD down
+    - eval_time: 10m
+      alertname: OSD down
+      exp_alerts:
+      - exp_labels:
+          name: "OSD_DOWN"
+          severity: warning
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.4.2
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-down
+          description: |
+            1 OSD down for over 5 minutes.
+
+            The following OSD is down:
+              - osd.1 on ceph-osd-2
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="OSD_NEARFULL"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: OSDs near full
+    - eval_time: 10m
+      alertname: OSDs near full
+      exp_alerts:
+      - exp_labels:
+          name: "OSD_NEARFULL"
+          severity: warning
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.15.1.2.4.3
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-nearfull
+          description: |
+            One or more OSDs have reached their NEARFULL threshold.
+
+            Use 'ceph health detail' to identify which OSDs have reached this threshold.
+            To resolve, either add capacity to the cluster, or delete unwanted data
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: OSD unable to perform rebalance
+    - eval_time: 10m
+      alertname: OSD unable to perform rebalance
+      exp_alerts:
+      - exp_labels:
+          name: "OSD_BACKFILLFULL"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-backfillfull
+          description: |
+            An OSD has reached its BACKFILL FULL threshold. This will prevent rebalance operations
+            from completing for some pools. Check the current capacity utilisation with 'ceph df'
+
+            To resolve, either add capacity to the cluster, or delete unwanted data
+ - interval: 30s
+   input_series:
+    - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}'
+      values: '0+0x2 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}'
+           value: 0
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: OSD too many read repairs
+    - eval_time: 10m
+      alertname: OSD too many read repairs
+      exp_alerts:
+      - exp_labels:
+          name: "OSD_TOO_MANY_REPAIRS"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-too-many-repairs
+          description: |
+            Reads from an OSD have used a secondary PG to return data to the client, indicating
+            a potentially failing disk.
+# Pools
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Ceph pool is too full for recovery/rebalance
+    - eval_time: 5m
+      alertname: Ceph pool is too full for recovery/rebalance
+      exp_alerts:
+      - exp_labels:
+          name: "POOL_BACKFILLFULL"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          description: >
+            A pool is approaching its near-full threshold, which will
+            prevent rebalance operations from completing. You should
+            consider adding more capacity to the pool.
+
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="POOL_FULL"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="POOL_FULL"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Ceph pool is full - writes blocked
+    - eval_time: 10m
+      alertname: Ceph pool is full - writes blocked
+      exp_alerts:
+      - exp_labels:
+          name: "POOL_FULL"
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pool-full
+          description: |
+            A pool has reached its MAX quota, or the OSDs supporting the pool
+            have reached their FULL threshold. Until this is resolved, writes to
+            the pool will be blocked.
+
+            Determine the affected pool with 'ceph df detail', for example looking
+            at QUOTA BYTES and STORED. Either increase the pool's quota, or add
+            capacity to the cluster first, then increase its quota
+            (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Ceph pool is approaching full
+    - eval_time: 10m
+      alertname: Ceph pool is approaching full
+      exp_alerts:
+      - exp_labels:
+          name: "POOL_NEAR_FULL"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          description: |
+            A pool has exceeded its warning (percent full) threshold, or the OSDs
+            supporting the pool have reached their NEARFULL thresholds. Writes may
+            continue, but you are at risk of the pool going read only if more capacity
+            isn't made available.
+
+            Determine the affected pool with 'ceph df detail', for example looking
+            at QUOTA BYTES and STORED. Either increase the pool's quota, or add
+            capacity to the cluster first, then increase its quota
+            (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
+
+# PGs
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Placement Group(s) have not been scrubbed
+    - eval_time: 10m
+      alertname: Placement Group(s) have not been scrubbed
+      exp_alerts:
+      - exp_labels:
+          name: "PG_NOT_SCRUBBED"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-not-scrubbed
+          description: |
+            One or more PGs have not been scrubbed recently. The scrub process is a data integrity
+            feature, protecting against bit-rot. It checks that objects and their metadata (size and
+            attributes) match across object replicas. When PGs miss their scrub window, it may
+            indicate the scrub window is too small, or PGs were not in a 'clean' state during the
+            scrub window.
+
+            You can manually initiate a scrub with: ceph pg scrub <pgid>
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}'
+      values: '0+0x2 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}'
+           value: 0
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Recovery at risk, cluster too full
+    - eval_time: 10m
+      alertname: Recovery at risk, cluster too full
+      exp_alerts:
+      - exp_labels:
+          name: "PG_RECOVERY_FULL"
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-recovery-full
+          description: >
+            Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their
+            'full' threshold. Add more capacity to the cluster, or delete unwanted data.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}'
+      values: '0+0x2 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}'
+           value: 0
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Cluster too full, automatic data recovery impaired
+    - eval_time: 10m
+      alertname: Cluster too full, automatic data recovery impaired
+      exp_alerts:
+      - exp_labels:
+          name: "PG_BACKFILL_FULL"
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-backfill-full
+          description: >
+            Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs
+            have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data.
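+# PG_AVAILABILITY: the '- scalar(ceph_health_detail{name="OSD_DOWN"})' term means the
+# expression only yields 1 while PG_AVAILABILITY is asserted and OSD_DOWN is clear,
+# so the alert stays silent when the availability issue is simply a down OSD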
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="PG_AVAILABILITY"}'
+      values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_health_detail{name="OSD_DOWN"}'
+      values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0'
+   promql_expr_test:
+     - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
+       eval_time: 1m
+       # empty set at 1m
+       exp_samples:
+   alert_rule_test:
+    # PG_AVAILABILITY and OSD_DOWN not firing .. no alert
+    - eval_time: 1m
+      alertname: I/O blocked to some data
+      exp_alerts:
+    # PG_AVAILABILITY firing, but osd_down is active .. no alert
+    - eval_time: 5m
+      alertname: I/O blocked to some data
+      exp_alerts:
+    # PG_AVAILABILITY firing, AND OSD_DOWN is not active...raise the alert
+    - eval_time: 15m
+      alertname: I/O blocked to some data
+      exp_alerts:
+      - exp_labels:
+          name: "PG_AVAILABILITY"
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-availability
+          description: >
+            Data availability is reduced, impacting the cluster's ability to service I/O to some data. One or
+            more placement groups (PGs) are in a state that blocks I/O.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Placement Group(s) have not been 'DEEP' scrubbed
+    - eval_time: 10m
+      alertname: Placement Group(s) have not been 'DEEP' scrubbed
+      exp_alerts:
+      - exp_labels:
+          name: "PG_NOT_DEEP_SCRUBBED"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-not-deep-scrubbed
+          description: |
+            One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity
+            feature, protecting against bit-rot. It compares the contents of objects and their
+            replicas for inconsistency. When PGs miss their deep scrub window, it may indicate
+            that the window is too small or PGs were not in a 'clean' state during the deep-scrub
+            window.
+
+            You can manually initiate a deep scrub with: ceph pg deep-scrub <pgid>
+
+# Prometheus
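+# absent(up{job="ceph"}) returns a synthetic sample whose labels come from the
+# equality matchers in the selector, hence the expected '{job="ceph"}' sample below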
+ - interval: 1m
+   input_series:
+    - series: 'up{job="myjob"}'
+      values: '1+0x10'
+   promql_expr_test:
+     - expr: absent(up{job="ceph"})
+       eval_time: 1m
+       exp_samples:
+         - labels: '{job="ceph"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 5m
+      alertname: Scrape job is missing
+      exp_alerts:
+      - exp_labels:
+          job: ceph
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          description: |
+            The Prometheus job that scrapes from Ceph is no longer defined; this
+            effectively means you'll have no metrics or alerts for the cluster.
+
+            Please review the job definitions in the prometheus.yml file of the prometheus
+            instance.
+# RADOS
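+# OBJECT_UNFOUND is gated on every OSD being up: the 'count(ceph_osd_up == 1) == bool
+# count(ceph_osd_metadata)' term drops the sample while any OSD is down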
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}'
+      values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
+      values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
+       eval_time: 1m
+       exp_samples:
+   alert_rule_test:
+    # OBJECT_UNFOUND but osd.2 is down, so don't fire
+    - eval_time: 5m
+      alertname: Data not found/missing
+      exp_alerts:
+    # OBJECT_UNFOUND and all osd's are online, so fire
+    - eval_time: 15m
+      alertname: Data not found/missing
+      exp_alerts:
+      - exp_labels:
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#object-unfound
+          description: |
+            A version of a RADOS object cannot be found, even though all OSDs are up. I/O
+            requests for this object from clients will block (hang). Resolving this issue may
+            require the object to be rolled back to a prior version manually, and then verified.
\ No newline at end of file
index b626910bb6a69e554b844c043b7af5bcac691404..f8573b61cbaf3470b1cbb66eb7ebf7f49ecfe279 100644 (file)
@@ -7,10 +7,12 @@ import os
 import re
 import threading
 import time
-from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT
+import enum
+from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand
 from mgr_util import get_default_addr, profile_method, build_url
 from rbd import RBD
 from collections import namedtuple
+import yaml
 
 from typing import DefaultDict, Optional, Dict, Any, Set, cast, Tuple, Union, List
 
@@ -115,6 +117,189 @@ HEALTH_CHECKS = [
     alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'),
 ]
 
+HEALTHCHECK_DETAIL = ('name', 'severity')
+
+
+class Severity(enum.Enum):
+    ok = "HEALTH_OK"
+    warn = "HEALTH_WARN"
+    error = "HEALTH_ERR"
+
+
+class Format(enum.Enum):
+    plain = 'plain'
+    json = 'json'
+    json_pretty = 'json-pretty'
+    yaml = 'yaml'
+
+
+class HealthCheckEvent:
+
+    def __init__(self, name: str, severity: Severity, first_seen: float, last_seen: float, count: int, active: bool = True):
+        self.name = name
+        self.severity = severity
+        self.first_seen = first_seen
+        self.last_seen = last_seen
+        self.count = count
+        self.active = active
+
+    def as_dict(self) -> Dict[str, Any]:
+        """Return the instance as a dictionary."""
+        return self.__dict__
+
+
+class HealthHistory:
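+    """Track healthchecks seen by the cluster and persist them in the mon KV store."""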
+    kv_name = 'health_history'
+    titles = "{healthcheck_name:<24}  {first_seen:<20}  {last_seen:<20}  {count:>5}  {active:^6}"
+    date_format = "%Y/%m/%d %H:%M:%S"
+
+    def __init__(self, mgr: MgrModule):
+        self.mgr = mgr
+        self.lock = threading.Lock()
+        self.healthcheck: Dict[str, HealthCheckEvent] = {}
+        self._load()
+
+    def _load(self) -> None:
+        """Load the current state from the mons KV store."""
+        data = self.mgr.get_store(self.kv_name)
+        if data:
+            try:
+                healthcheck_data = json.loads(data)
+            except json.JSONDecodeError:
+                self.mgr.log.warn(
+                    f"INVALID data read from mgr/prometheus/{self.kv_name}. Resetting")
+                self.reset()
+                return
+            else:
+                for k, v in healthcheck_data.items():
+                    self.healthcheck[k] = HealthCheckEvent(
+                        name=k,
+                        severity=v.get('severity'),
+                        first_seen=v.get('first_seen', 0),
+                        last_seen=v.get('last_seen', 0),
+                        count=v.get('count', 1),
+                        active=v.get('active', True))
+        else:
+            self.reset()
+
+    def reset(self) -> None:
+        """Reset the healthcheck history."""
+        with self.lock:
+            self.mgr.set_store(self.kv_name, "{}")
+            self.healthcheck = {}
+
+    def save(self) -> None:
+        """Save the current in-memory healthcheck history to the KV store."""
+        with self.lock:
+            self.mgr.set_store(self.kv_name, self.as_json())
+
+    def check(self, health_checks: Dict[str, Any]) -> None:
+        """Look at the current health checks and compare existing the history.
+
+        Args:
+            health_checks (Dict[str, Any]): current health check data
+        """
+
+        current_checks = health_checks.get('checks', {})
+        changes_made = False
+
+        # first turn off any active states we're tracking
+        for seen_check in self.healthcheck:
+            check = self.healthcheck[seen_check]
+            if check.active and seen_check not in current_checks:
+                check.active = False
+                changes_made = True
+
+        # now look for any additions to track
+        now = time.time()
+        for name, info in current_checks.items():
+            if name not in self.healthcheck:
+                # this healthcheck is new, so start tracking it
+                changes_made = True
+                self.healthcheck[name] = HealthCheckEvent(
+                    name=name,
+                    severity=info.get('severity'),
+                    first_seen=now,
+                    last_seen=now,
+                    count=1,
+                    active=True
+                )
+            else:
+                # seen it before, so update its metadata
+                check = self.healthcheck[name]
+                if check.active:
+                    # check has been registered as active already, so skip
+                    continue
+                else:
+                    check.last_seen = now
+                    check.count += 1
+                    check.active = True
+                    changes_made = True
+
+        if changes_made:
+            self.save()
+
+    def __str__(self) -> str:
+        """Print the healthcheck history.
+
+        Returns:
+            str: Human readable representation of the healthcheck history
+        """
+        out = []
+
+        if len(self.healthcheck.keys()) == 0:
+            out.append("No healthchecks have been recorded")
+        else:
+            out.append(self.titles.format(
+                healthcheck_name="Healthcheck Name",
+                first_seen="First Seen (UTC)",
+                last_seen="Last seen (UTC)",
+                count="Count",
+                active="Active")
+            )
+            for k in sorted(self.healthcheck.keys()):
+                check = self.healthcheck[k]
+                out.append(self.titles.format(
+                    healthcheck_name=check.name,
+                    first_seen=time.strftime(self.date_format, time.localtime(check.first_seen)),
+                    last_seen=time.strftime(self.date_format, time.localtime(check.last_seen)),
+                    count=check.count,
+                    active="Yes" if check.active else "No")
+                )
+            out.extend([f"{len(self.healthcheck)} health check(s) listed", ""])
+
+        return "\n".join(out)
+
+    def as_dict(self) -> Dict[str, Any]:
+        """Return the history in a dictionary.
+
+        Returns:
+            Dict[str, Any]: dictionary indexed by the healthcheck name
+        """
+        return {name: self.healthcheck[name].as_dict() for name in self.healthcheck}
+
+    def as_json(self, pretty: bool = False) -> str:
+        """Return the healthcheck history object as a dict (JSON).
+
+        Args:
+            pretty (bool, optional): whether to json pretty print the history. Defaults to False.
+
+        Returns:
+            str: str representation of the healthcheck in JSON format
+        """
+        if pretty:
+            return json.dumps(self.as_dict(), indent=2)
+        else:
+            return json.dumps(self.as_dict())
+
+    def as_yaml(self) -> str:
+        """Return the healthcheck history in yaml format.
+
+        Returns:
+            str: YAML representation of the healthcheck history
+        """
+        return yaml.safe_dump(self.as_dict(), explicit_start=True, default_flow_style=False)
+
 
 class Metric(object):
     def __init__(self, mtype: str, name: str, desc: str, labels: Optional[Tuple[str, ...]] = None) -> None:
@@ -331,6 +516,7 @@ class Module(MgrModule):
         global _global_instance
         _global_instance = self
         self.metrics_thread = MetricCollectionThread(_global_instance)
+        self.health_history = HealthHistory(self)
 
     def _setup_static_metrics(self) -> Dict[str, Metric]:
         metrics = {}
@@ -432,6 +618,13 @@ class Module(MgrModule):
             ('pool_id',)
         )
 
+        metrics['health_detail'] = Metric(
+            'gauge',
+            'health_detail',
+            'healthcheck status by type (0=inactive, 1=active)',
+            HEALTHCHECK_DETAIL
+        )
+
         for flag in OSD_FLAGS:
             path = 'osd_flag_{}'.format(flag)
             metrics[path] = Metric(
@@ -521,7 +714,7 @@ class Module(MgrModule):
         )
 
         # Examine the health to see if any health checks triggered need to
-        # become a metric.
+        # become a specific metric with a value from the health detail
         active_healthchecks = health.get('checks', {})
         active_names = active_healthchecks.keys()
 
@@ -553,6 +746,15 @@ class Module(MgrModule):
                     # health check is not active, so give it a default of 0
                     self.metrics[path].set(0)
 
+        self.health_history.check(health)
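+        # export one ceph_health_detail sample per healthcheck that has been tracked:
+        # 1 while the check is currently active, 0 once it has cleared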
+        for name, info in self.health_history.healthcheck.items():
+            v = 1 if info.active else 0
+            self.metrics['health_detail'].set(
+                v, (
+                    name,
+                    str(info.severity))
+            )
+
     @profile_method()
     def get_pool_stats(self) -> None:
         # retrieve pool stats to provide per pool recovery metrics
@@ -1420,6 +1622,37 @@ class Module(MgrModule):
         self.log.info('Stopping engine...')
         self.shutdown_event.set()
 
+    @CLIReadCommand('healthcheck history ls')
+    def _list_healthchecks(self, format: Format = Format.plain) -> HandleCommandResult:
+        """List all the healthchecks being tracked
+
+        The format options are parsed in ceph_argparse before they get evaluated here, so
+        we can safely assume that what we have to process is valid. ceph_argparse will throw
+        a ValueError if the cast to our Format class fails.
+
+        Args:
+            format (Format, optional): output format. Defaults to Format.plain.
+
+        Returns:
+            HandleCommandResult: return code, stdout and stderr returned to the caller
+        """
+
+        out = ""
+        if format == Format.plain:
+            out = str(self.health_history)
+        elif format == Format.yaml:
+            out = self.health_history.as_yaml()
+        else:
+            out = self.health_history.as_json(format == Format.json_pretty)
+
+        return HandleCommandResult(retval=0, stdout=out)
+
+    @CLIWriteCommand('healthcheck history clear')
+    def _clear_healthchecks(self) -> HandleCommandResult:
+        """Clear the healthcheck history"""
+        self.health_history.reset()
+        return HandleCommandResult(retval=0, stdout="healthcheck history cleared")
+
 
 class StandbyModule(MgrStandbyModule):
     def __init__(self, *args: Any, **kwargs: Any) -> None: