From: Nizamudeen A
Date: Mon, 7 Feb 2022 10:53:29 +0000 (+0530)
Subject: cephadm: change shared_folder directory for prometheus and grafana
X-Git-Tag: v17.1.0~17^2~7
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=27c29946f32ecf1cb136fe6e08055501b92b8efb;p=ceph.git

cephadm: change shared_folder directory for prometheus and grafana

After https://github.com/ceph/ceph/pull/44059 the monitoring/prometheus and
monitoring/grafana/dashboards directories were changed to monitoring/ceph-mixin.
That broke the shared_folders in the cephadm bootstrap script.

Changed all instances of monitoring/prometheus and monitoring/grafana/dashboards
to monitoring/ceph-mixin. Also renamed all instances of prometheus_alerts.yaml
to prometheus_alerts.yml.

Fixes: https://tracker.ceph.com/issues/54176
Signed-off-by: Nizamudeen A

(cherry picked from commit 27592b75618706194e668c40056d9bfc58c5a3c6)
---

diff --git a/ceph.spec.in b/ceph.spec.in
index e18c881783be..bb3c0553af5a 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -1432,7 +1432,7 @@ mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-rbd
 mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-rbd-mirror
 # prometheus alerts
-install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yaml %{buildroot}/etc/prometheus/ceph/ceph_default_alerts.yml
+install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yml %{buildroot}/etc/prometheus/ceph/ceph_default_alerts.yml
 %if 0%{?suse_version}
 # create __pycache__ directories and their contents
diff --git a/debian/rules b/debian/rules
index 0c3939745be6..fe54877e6db1 100755
--- a/debian/rules
+++ b/debian/rules
@@ -73,7 +73,7 @@ override_dh_auto_install:
 	install -m 755 src/cephadm/cephadm $(DESTDIR)/usr/sbin/cephadm
-	install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yaml $(DESTDIR)/etc/prometheus/ceph/ceph_default_alerts.yml
+	install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yml $(DESTDIR)/etc/prometheus/ceph/ceph_default_alerts.yml
 # doc/changelog is a directory, which confuses dh_installchangelogs
 override_dh_installchangelogs:
diff --git a/doc/mgr/dashboard.rst b/doc/mgr/dashboard.rst
index 44faefaea123..60fca266e3ec 100644
--- a/doc/mgr/dashboard.rst
+++ b/doc/mgr/dashboard.rst
@@ -534,14 +534,14 @@ on appropriate hosts, proceed with the following steps.
 Dashboards can be added to Grafana by importing dashboard JSON files.
 Use the following command to download the JSON files::
-    wget https://raw.githubusercontent.com/ceph/ceph/master/monitoring/grafana/dashboards/.json
+    wget https://raw.githubusercontent.com/ceph/ceph/master/monitoring/ceph-mixin/dashboards_out/.json
 You can find various dashboard JSON files `here <https://github.com/ceph/ceph/tree/
-master/monitoring/grafana/dashboards>`_ .
+master/monitoring/ceph-mixin/dashboards_out>`_ .
 For Example, for ceph-cluster overview you can use::
-    wget https://raw.githubusercontent.com/ceph/ceph/master/monitoring/grafana/dashboards/ceph-cluster.json
+    wget https://raw.githubusercontent.com/ceph/ceph/master/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json
 You may also author your own dashboards.
diff --git a/monitoring/ceph-mixin/README.md b/monitoring/ceph-mixin/README.md
index 7c0cd064c67a..fd0fe95ed90b 100644
--- a/monitoring/ceph-mixin/README.md
+++ b/monitoring/ceph-mixin/README.md
@@ -2,7 +2,7 @@
 A set of Grafana dashboards and Prometheus alerts for Ceph.
 All the Grafana dashboards are already generated in the `dashboards_out`
-directory and alerts in the `prometheus_alerts.yaml` file.
+directory and alerts in the `prometheus_alerts.yml` file.
You can use the Grafana dashboards and alerts with Jsonnet like any other prometheus mixin. You can find more ressources about mixins in general on @@ -26,7 +26,7 @@ plugin](http://docs.ceph.com/en/latest/mgr/prometheus/) and the ### Prometheus alerts -In `prometheus_alerts.yaml` you'll find a set of Prometheus +In `prometheus_alerts.yml` you'll find a set of Prometheus alert rules that should provide a decent set of default alerts for a Ceph cluster. Just put this file in a place according to your Prometheus configuration (wherever the `rules` configuration stanza points). diff --git a/monitoring/ceph-mixin/alerts.libsonnet b/monitoring/ceph-mixin/alerts.libsonnet index 8671637de5d5..9c759938a051 100644 --- a/monitoring/ceph-mixin/alerts.libsonnet +++ b/monitoring/ceph-mixin/alerts.libsonnet @@ -1,3 +1,3 @@ { - prometheusAlerts+:: std.parseYaml(importstr 'prometheus_alerts.yaml'), + prometheusAlerts+:: std.parseYaml(importstr 'prometheus_alerts.yml'), } diff --git a/monitoring/ceph-mixin/prometheus_alerts.yaml b/monitoring/ceph-mixin/prometheus_alerts.yaml deleted file mode 100644 index fc38678f99dd..000000000000 --- a/monitoring/ceph-mixin/prometheus_alerts.yaml +++ /dev/null @@ -1,890 +0,0 @@ -groups: - - name: cluster health - rules: - - alert: CephHealthError - expr: ceph_health_status == 2 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.2.1 - annotations: - summary: Cluster is in an ERROR state - description: > - Ceph in HEALTH_ERROR state for more than 5 minutes. - Please check "ceph health detail" for more information. - - - alert: CephHealthWarning - expr: ceph_health_status == 1 - for: 15m - labels: - severity: warning - type: ceph_default - annotations: - summary: Cluster is in a WARNING state - description: > - Ceph has been in HEALTH_WARN for more than 15 minutes. - Please check "ceph health detail" for more information. - - - name: mon - rules: - - alert: CephMonDownQuorumAtRisk - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1 - for: 30s - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.3.1 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down - summary: Monitor quorum is at risk - description: | - {{ $min := query "floor(count(ceph_mon_metadata) / 2) +1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active - Without quorum the cluster will become inoperable, affecting all connected clients and services. - - The following monitors are down: - {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - - alert: CephMonDown - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)) - for: 30s - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down - summary: One of more ceph monitors are down - description: | - {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. - Quorum is still intact, but the loss of further monitors will make your cluster inoperable. 
- - The following monitors are down: - {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - - alert: CephMonDiskspaceCritical - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.3.2 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit - summary: Disk space on at least one monitor is critically low - description: | - The free space available to a monitor's store is critically low (<5% by default). - You should increase the space available to the monitor(s). The - default location for the store sits under /var/lib/ceph. Your monitor hosts are; - {{- range query "ceph_mon_metadata"}} - - {{ .Labels.hostname }} - {{- end }} - - - alert: CephMonDiskspaceLow - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1 - for: 5m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low - summary: Disk space on at least one monitor is approaching full - description: | - The space available to a monitor's store is approaching full (>70% is the default). - You should increase the space available to the monitor store. The - default location for the store sits under /var/lib/ceph. Your monitor hosts are; - {{- range query "ceph_mon_metadata"}} - - {{ .Labels.hostname }} - {{- end }} - - - alert: CephMonClockSkew - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1 - for: 1m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew - summary: Clock skew across the Monitor hosts detected - description: | - The ceph monitors rely on a consistent time reference to maintain - quorum and cluster consistency. This event indicates that at least - one of your mons is not sync'd correctly. - - Review the cluster status with ceph -s. This will show which monitors - are affected. Check the time sync status on each monitor host. - - - name: osd - rules: - - alert: CephOSDDownHigh - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10 - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.4.1 - annotations: - summary: More than 10% of OSDs are down - description: | - {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). 
- - The following OSDs are down: - {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - - alert: CephOSDHostDown - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1 - for: 5m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.4.8 - annotations: - summary: An OSD host is offline - description: | - The following OSDs are down: - {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} - {{- end }} - - alert: CephOSDDown - expr: ceph_health_detail{name="OSD_DOWN"} == 1 - for: 5m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.4.2 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down - summary: An OSD has been marked down/unavailable - description: | - {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. - - The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: - {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - - - alert: CephOSDNearFull - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1 - for: 5m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.4.3 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull - summary: OSD(s) running low on free space (NEARFULL) - description: | - One or more OSDs have reached their NEARFULL threshold - - Use 'ceph health detail' to identify which OSDs have reached this threshold. - To resolve, either add capacity to the cluster, or delete unwanted data - - alert: CephOSDFull - expr: ceph_health_detail{name="OSD_FULL"} > 0 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.4.6 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full - summary: OSD(s) is full, writes blocked - description: | - An OSD has reached it's full threshold. Writes from all pools that share the - affected OSD will be blocked. - - To resolve, either add capacity to the cluster, or delete unwanted data - - alert: CephOSDBackfillFull - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0 - for: 1m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull - summary: OSD(s) too full for backfill operations - description: | - An OSD has reached it's BACKFILL FULL threshold. This will prevent rebalance operations - completing for some pools. Check the current capacity utilisation with 'ceph df' - - To resolve, either add capacity to the cluster, or delete unwanted data - - alert: CephOSDTooManyRepairs - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1 - for: 30s - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs - summary: OSD has hit a high number of read errors - description: | - Reads from an OSD have used a secondary PG to return data to the client, indicating - a potential failing disk. 
- - alert: CephOSDTimeoutsPublicNetwork - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1 - for: 1m - labels: - severity: warning - type: ceph_default - annotations: - summary: Network issues delaying OSD heartbeats (public network) - description: | - OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network - for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs. - - alert: CephOSDTimeoutsClusterNetwork - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1 - for: 1m - labels: - severity: warning - type: ceph_default - annotations: - summary: Network issues delaying OSD heartbeats (cluster network) - description: | - OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network - for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs. - - alert: CephOSDInternalDiskSizeMismatch - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1 - for: 1m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch - summary: OSD size inconsistency error - description: | - One or more OSDs have an internal inconsistency between the size of the physical device and it's metadata. - This could lead to the OSD(s) crashing in future. You should redeploy the effected OSDs. - - alert: CephDeviceFailurePredicted - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1 - for: 1m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2 - summary: Device(s) have been predicted to fail soon - description: | - The device health module has determined that one or more devices will fail - soon. To review the device states use 'ceph device ls'. To show a specific - device use 'ceph device info '. - - Mark the OSD as out (so data may migrate to other OSDs in the cluster). Once - the osd is empty remove and replace the OSD. - - alert: CephDeviceFailurePredictionTooHigh - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.4.7 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany - summary: Too many devices have been predicted to fail, unable to resolve - description: | - The device health module has determined that the number of devices predicted to - fail can not be remediated automatically, since it would take too many osd's out of - the cluster, impacting performance and potentially availabililty. You should add new - OSDs to the cluster to allow data to be relocated to avoid the data integrity issues. - - alert: CephDeviceFailureRelocationIncomplete - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1 - for: 1m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use - summary: A device failure is predicted, but unable to relocate data - description: | - The device health module has determined that one or more devices will fail - soon, but the normal process of relocating the data on the device to other - OSDs in the cluster is blocked. - - Check the the cluster has available freespace. 
It may be necessary to add - more disks to the cluster to allow the data from the failing device to - successfully migrate. - - - alert: CephOSDFlapping - expr: | - ( - rate(ceph_osd_up[5m]) - * on(ceph_daemon) group_left(hostname) ceph_osd_metadata - ) * 60 > 1 - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.4.4 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds - summary: Network issues are causing OSD's to flap (mark each other out) - description: > - OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was - marked down and back up at {{ $value | humanize }} times once a - minute for 5 minutes. This could indicate a network issue (latency, - packet drop, disruption) on the clusters "cluster network". Check the - network environment on the listed host(s). - - - alert: CephOSDReadErrors - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1 - for: 30s - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors - summary: Device read errors detected - description: > - An OSD has encountered read errors, but the OSD has recovered by retrying - the reads. This may indicate an issue with the Hardware or Kernel. - # alert on high deviation from average PG count - - alert: CephPGImbalance - expr: | - abs( - ( - (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job) - ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job) - ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 - for: 5m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.4.5 - annotations: - summary: PG allocations are not balanced across devices - description: > - OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates - by more than 30% from average PG count. - # alert on high commit latency...but how high is too high - - - name: mds - rules: - - alert: CephFilesystemDamaged - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.5.1 - annotations: - documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages - summary: Ceph filesystem is damaged. - description: > - The filesystems metadata has been corrupted. Data access - may be blocked. - - Either analyse the output from the mds daemon admin socket, or - escalate to support - - alert: CephFilesystemOffline - expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.5.3 - annotations: - documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down - summary: Ceph filesystem is offline - description: > - All MDS ranks are unavailable. The ceph daemons providing the metadata - for the Ceph filesystem are all down, rendering the filesystem offline. - - alert: CephFilesystemDegraded - expr: ceph_health_detail{name="FS_DEGRADED"} > 0 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.5.4 - annotations: - documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded - summary: Ceph filesystem is degraded - description: > - One or more metdata daemons (MDS ranks) are failed or in a - damaged state. 
At best the filesystem is partially available, - worst case is the filesystem is completely unusable. - - alert: CephFilesystemMDSRanksLow - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0 - for: 1m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max - summary: Ceph MDS daemon count is lower than configured - description: > - The filesystem's "max_mds" setting defined the number of MDS ranks in - the filesystem. The current number of active MDS daemons is less than - this setting. - - alert: CephFilesystemInsufficientStandby - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0 - for: 1m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby - summary: Ceph filesystem standby daemons too low - description: > - The minimum number of standby daemons determined by standby_count_wanted - is less than the actual number of standby daemons. Adjust the standby count - or increase the number of mds daemons within the filesystem. - - alert: CephFilesystemFailureNoStandby - expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.5.5 - annotations: - documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds - summary: Ceph MDS daemon failed, no further standby available - description: > - An MDS daemon has failed, leaving only one active rank without - further standby. Investigate the cause of the failure or add a - standby daemon - - alert: CephFilesystemReadOnly - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.5.2 - annotations: - documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages - summary: Ceph filesystem in read only mode, due to write error(s) - description: > - The filesystem has switched to READ ONLY due to an unexpected - write error, when writing to the metadata pool - - Either analyse the output from the mds daemon admin socket, or - escalate to support - - - name: mgr - rules: - - alert: CephMgrModuleCrash - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.6.1 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash - summary: A mgr module has recently crashed - description: > - One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A - crashed module may impact functionality within the cluster. Use the 'ceph crash' commands to - investigate which module has failed, and archive it to acknowledge the failure. - - alert: CephMgrPrometheusModuleInactive - expr: up{job="ceph"} == 0 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.6.2 - annotations: - summary: Ceph's mgr/prometheus module is not available - description: > - The mgr/prometheus module at {{ $labels.instance }} is unreachable. This - could mean that the module has been disabled or the mgr itself is down. - - Without the mgr/prometheus module metrics and alerts will no longer - function. Open a shell to ceph and use 'ceph -s' to to determine whether the - mgr is active. 
If the mgr is not active, restart it, otherwise you can check - the mgr/prometheus module is loaded with 'ceph mgr module ls' and if it's - not listed as enabled, enable it with 'ceph mgr module enable prometheus' - - - name: pgs - rules: - - alert: CephPGsInactive - expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.7.1 - annotations: - summary: One or more Placement Groups are inactive - description: > - {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. - Inactive placement groups aren't able to serve read/write - requests. - - alert: CephPGsUnclean - expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0 - for: 15m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.7.2 - annotations: - summary: One or more platcment groups are marked unclean - description: > - {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}. - Unclean PGs haven't been able to completely recover from a previous failure. - - alert: CephPGsDamaged - expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.7.4 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged - summary: Placement group damaged, manual intervention needed - description: > - During data consistency checks (scrub), at least one PG has been flagged as being - damaged or inconsistent. - - Check to see which PG is affected, and attempt a manual repair if neccessary. To list - problematic placement groups, use 'rados list-inconsistent-pg '. To repair PGs use - the 'ceph pg repair ' command. - - alert: CephPGRecoveryAtRisk - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.7.5 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full - summary: OSDs are too full for automatic recovery - description: > - Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their - 'full' threshold. Add more capacity to the cluster, or delete unwanted data. - - alert: CephPGUnavilableBlockingIO - # PG_AVAILABILITY, but an OSD is not in a DOWN state - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.7.3 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability - summary: Placement group is unavailable, blocking some I/O - description: > - Data availability is reduced impacting the clusters abilty to service I/O to some data. One or - more placement groups (PGs) are in a state that blocks IO. - - alert: CephPGBackfillAtRisk - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.7.6 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full - summary: Backfill operations are blocked, due to lack of freespace - description: > - Data redundancy may be at risk due to lack of free space within the cluster. 
One or more OSDs - have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data. - - alert: CephPGNotScrubbed - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1 - for: 5m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed - summary: Placement group(s) have not been scrubbed - description: | - One or more PGs have not been scrubbed recently. The scrub process is a data integrity - feature, protectng against bit-rot. It checks that objects and their metadata (size and - attributes) match across object replicas. When PGs miss their scrub window, it may - indicate the scrub window is too small, or PGs were not in a 'clean' state during the - scrub window. - - You can manually initiate a scrub with: ceph pg scrub - - alert: CephPGsHighPerOSD - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1 - for: 1m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs - summary: Placement groups per OSD is too high - description: | - The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting). - - Check that the pg_autoscaler hasn't been disabled for any of the pools, with 'ceph osd pool autoscale-status' - and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide - the autoscaler based on the expected relative size of the pool - (i.e. 'ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') - - alert: CephPGNotDeepScrubbed - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1 - for: 5m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed - summary: Placement group(s) have not been deep scrubbed - description: | - One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity - feature, protectng against bit-rot. It compares the contents of objects and their - replicas for inconsistency. When PGs miss their deep scrub window, it may indicate - that the window is too small or PGs were not in a 'clean' state during the deep-scrub - window. - - You can manually initiate a deep scrub with: ceph pg deep-scrub - - - name: nodes - rules: - - alert: CephNodeRootFilesystemFull - expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.8.1 - annotations: - summary: Root filesystem is dangerously full - description: > - Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free. 
- - # alert on nic packet errors and drops rates > 1% packets/s - - alert: CephNodeNetworkPacketDrops - expr: | - ( - increase(node_network_receive_drop_total{device!="lo"}[1m]) + - increase(node_network_transmit_drop_total{device!="lo"}[1m]) - ) / ( - increase(node_network_receive_packets_total{device!="lo"}[1m]) + - increase(node_network_transmit_packets_total{device!="lo"}[1m]) - ) >= 0.0001 or ( - increase(node_network_receive_drop_total{device!="lo"}[1m]) + - increase(node_network_transmit_drop_total{device!="lo"}[1m]) - ) >= 10 - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.8.2 - annotations: - summary: One or more Nics is seeing packet drops - description: > - Node {{ $labels.instance }} experiences packet drop > 0.01% or > - 10 packets/s on interface {{ $labels.device }}. - - - alert: CephNodeNetworkPacketErrors - expr: | - ( - increase(node_network_receive_errs_total{device!="lo"}[1m]) + - increase(node_network_transmit_errs_total{device!="lo"}[1m]) - ) / ( - increase(node_network_receive_packets_total{device!="lo"}[1m]) + - increase(node_network_transmit_packets_total{device!="lo"}[1m]) - ) >= 0.0001 or ( - increase(node_network_receive_errs_total{device!="lo"}[1m]) + - increase(node_network_transmit_errs_total{device!="lo"}[1m]) - ) >= 10 - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.8.3 - annotations: - summary: One or more Nics is seeing packet errors - description: > - Node {{ $labels.instance }} experiences packet errors > 0.01% or - > 10 packets/s on interface {{ $labels.device }}. - - # Restrict to device names beginning with '/' to skip false alarms from - # tmpfs, overlay type filesystems - - alert: CephNodeDiskspaceWarning - expr: | - predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) * - on(instance) group_left(nodename) node_uname_info < 0 - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.8.4 - annotations: - summary: Host filesystem freespace is getting low - description: > - Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} - will be full in less than 5 days assuming the average fill-up - rate of the past 48 hours. - - - alert: CephNodeInconsistentMTU - expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"})) - labels: - severity: warning - type: ceph_default - annotations: - summary: MTU settings across Ceph hosts are inconsistent - description: > - Node {{ $labels.instance }} has a different MTU size ({{ $value }}) - than the median value on device {{ $labels.device }}. - - - name: pools - rules: - - alert: CephPoolGrowthWarning - expr: | - (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id) - group_right ceph_pool_metadata) >= 95 - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.9.2 - annotations: - summary: Pool growth rate may soon exceed it's capacity - description: > - Pool '{{ $labels.name }}' will be full in less than 5 days - assuming the average fill-up rate of the past 48 hours. - - alert: CephPoolBackfillFull - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0 - labels: - severity: warning - type: ceph_default - annotations: - summary: Freespace in a pool is too low for recovery/rebalance - description: > - A pool is approaching it's near full threshold, which will - prevent rebalance operations from completing. 
You should - consider adding more capacity to the pool. - - - alert: CephPoolFull - expr: ceph_health_detail{name="POOL_FULL"} > 0 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.9.1 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full - summary: Pool is full - writes are blocked - description: | - A pool has reached it's MAX quota, or the OSDs supporting the pool - have reached their FULL threshold. Until this is resolved, writes to - the pool will be blocked. - Pool Breakdown (top 5) - {{- range query "topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))" }} - - {{ .Labels.name }} at {{ .Value }}% - {{- end }} - Either increase the pools quota, or add capacity to the cluster first - then increase it's quota (e.g. ceph osd pool set quota max_bytes ) - - alert: CephPoolNearFull - expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0 - for: 5m - labels: - severity: warning - type: ceph_default - annotations: - summary: One or more Ceph pools are getting full - description: | - A pool has exceeeded it warning (percent full) threshold, or the OSDs - supporting the pool have reached their NEARFULL thresholds. Writes may - continue, but you are at risk of the pool going read only if more capacity - isn't made available. - - Determine the affected pool with 'ceph df detail', for example looking - at QUOTA BYTES and STORED. Either increase the pools quota, or add - capacity to the cluster first then increase it's quota - (e.g. ceph osd pool set quota max_bytes ) - - name: healthchecks - rules: - - alert: CephSlowOps - expr: ceph_healthcheck_slow_ops > 0 - for: 30s - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops - summary: MON/OSD operations are slow to complete - description: > - {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded) -# cephadm alerts - - name: cephadm - rules: - - alert: CephadmUpgradeFailed - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0 - for: 30s - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.11.2 - annotations: - summary: Ceph version upgrade has failed - description: > - The cephadm cluster upgrade process has failed. The cluster remains in - an undetermined state. - - Please review the cephadm logs, to understand the nature of the issue - - alert: CephadmDaemonFailed - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0 - for: 30s - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.11.1 - annotations: - summary: A ceph daemon manged by cephadm is down - description: > - A daemon managed by cephadm is no longer active. Determine, which - daemon is down with 'ceph health detail'. you may start daemons with - the 'ceph orch daemon start ' - - alert: CephadmPaused - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0 - for: 1m - labels: - severity: warning - type: ceph_default - annotations: - documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused - summary: Orchestration tasks via cephadm are PAUSED - description: > - Cluster management has been paused manually. This will prevent the - orchestrator from service management and reconciliation. 
If this is - not intentional, resume cephadm operations with 'ceph orch resume' - -# prometheus alerts - - name: PrometheusServer - rules: - - alert: PrometheusJobMissing - expr: absent(up{job="ceph"}) - for: 30s - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.12.1 - annotations: - summary: The scrape job for Ceph is missing from Prometheus - description: | - The prometheus job that scrapes from Ceph is no longer defined, this - will effectively mean you'll have no metrics or alerts for the cluster. - - Please review the job definitions in the prometheus.yml file of the prometheus - instance. -# Object related events - - name: rados - rules: - - alert: CephObjectMissing - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1 - for: 30s - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.10.1 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound - summary: Object(s) has been marked UNFOUND - description: | - A version of a RADOS object can not be found, even though all OSDs are up. I/O - requests for this object from clients will block (hang). Resolving this issue may - require the object to be rolled back to a prior version manually, and manually verified. -# Generic - - name: generic - rules: - - alert: CephDaemonCrash - expr: ceph_health_detail{name="RECENT_CRASH"} == 1 - for: 1m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.1.2.1.1.2 - annotations: - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash - summary: One or more Ceph daemons have crashed, and are pending acknowledgement - description: | - One or more daemons have crashed recently, and need to be acknowledged. This notification - ensures that software crashes don't go unseen. To acknowledge a crash, use the - 'ceph crash archive ' command. diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml new file mode 100644 index 000000000000..fc38678f99dd --- /dev/null +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -0,0 +1,890 @@ +groups: + - name: cluster health + rules: + - alert: CephHealthError + expr: ceph_health_status == 2 + for: 5m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.2.1 + annotations: + summary: Cluster is in an ERROR state + description: > + Ceph in HEALTH_ERROR state for more than 5 minutes. + Please check "ceph health detail" for more information. + + - alert: CephHealthWarning + expr: ceph_health_status == 1 + for: 15m + labels: + severity: warning + type: ceph_default + annotations: + summary: Cluster is in a WARNING state + description: > + Ceph has been in HEALTH_WARN for more than 15 minutes. + Please check "ceph health detail" for more information. 
+ + - name: mon + rules: + - alert: CephMonDownQuorumAtRisk + expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1 + for: 30s + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.3.1 + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down + summary: Monitor quorum is at risk + description: | + {{ $min := query "floor(count(ceph_mon_metadata) / 2) +1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active + Without quorum the cluster will become inoperable, affecting all connected clients and services. + + The following monitors are down: + {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} + - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} + {{- end }} + - alert: CephMonDown + expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)) + for: 30s + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down + summary: One of more ceph monitors are down + description: | + {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. + Quorum is still intact, but the loss of further monitors will make your cluster inoperable. + + The following monitors are down: + {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} + - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} + {{- end }} + - alert: CephMonDiskspaceCritical + expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1 + for: 1m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.3.2 + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit + summary: Disk space on at least one monitor is critically low + description: | + The free space available to a monitor's store is critically low (<5% by default). + You should increase the space available to the monitor(s). The + default location for the store sits under /var/lib/ceph. Your monitor hosts are; + {{- range query "ceph_mon_metadata"}} + - {{ .Labels.hostname }} + {{- end }} + + - alert: CephMonDiskspaceLow + expr: ceph_health_detail{name="MON_DISK_LOW"} == 1 + for: 5m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low + summary: Disk space on at least one monitor is approaching full + description: | + The space available to a monitor's store is approaching full (>70% is the default). + You should increase the space available to the monitor store. The + default location for the store sits under /var/lib/ceph. 
Your monitor hosts are; + {{- range query "ceph_mon_metadata"}} + - {{ .Labels.hostname }} + {{- end }} + + - alert: CephMonClockSkew + expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew + summary: Clock skew across the Monitor hosts detected + description: | + The ceph monitors rely on a consistent time reference to maintain + quorum and cluster consistency. This event indicates that at least + one of your mons is not sync'd correctly. + + Review the cluster status with ceph -s. This will show which monitors + are affected. Check the time sync status on each monitor host. + + - name: osd + rules: + - alert: CephOSDDownHigh + expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10 + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.1 + annotations: + summary: More than 10% of OSDs are down + description: | + {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). + + The following OSDs are down: + {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} + - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} + {{- end }} + - alert: CephOSDHostDown + expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1 + for: 5m + labels: + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.8 + annotations: + summary: An OSD host is offline + description: | + The following OSDs are down: + {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} + - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} + {{- end }} + - alert: CephOSDDown + expr: ceph_health_detail{name="OSD_DOWN"} == 1 + for: 5m + labels: + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.2 + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down + summary: An OSD has been marked down/unavailable + description: | + {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. + + The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: + {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} + - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} + {{- end }} + + - alert: CephOSDNearFull + expr: ceph_health_detail{name="OSD_NEARFULL"} == 1 + for: 5m + labels: + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.3 + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull + summary: OSD(s) running low on free space (NEARFULL) + description: | + One or more OSDs have reached their NEARFULL threshold + + Use 'ceph health detail' to identify which OSDs have reached this threshold. 
+ To resolve, either add capacity to the cluster, or delete unwanted data + - alert: CephOSDFull + expr: ceph_health_detail{name="OSD_FULL"} > 0 + for: 1m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.6 + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full + summary: OSD(s) is full, writes blocked + description: | + An OSD has reached it's full threshold. Writes from all pools that share the + affected OSD will be blocked. + + To resolve, either add capacity to the cluster, or delete unwanted data + - alert: CephOSDBackfillFull + expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull + summary: OSD(s) too full for backfill operations + description: | + An OSD has reached it's BACKFILL FULL threshold. This will prevent rebalance operations + completing for some pools. Check the current capacity utilisation with 'ceph df' + + To resolve, either add capacity to the cluster, or delete unwanted data + - alert: CephOSDTooManyRepairs + expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1 + for: 30s + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs + summary: OSD has hit a high number of read errors + description: | + Reads from an OSD have used a secondary PG to return data to the client, indicating + a potential failing disk. + - alert: CephOSDTimeoutsPublicNetwork + expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + summary: Network issues delaying OSD heartbeats (public network) + description: | + OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network + for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs. + - alert: CephOSDTimeoutsClusterNetwork + expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + summary: Network issues delaying OSD heartbeats (cluster network) + description: | + OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network + for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs. + - alert: CephOSDInternalDiskSizeMismatch + expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch + summary: OSD size inconsistency error + description: | + One or more OSDs have an internal inconsistency between the size of the physical device and it's metadata. + This could lead to the OSD(s) crashing in future. You should redeploy the effected OSDs. + - alert: CephDeviceFailurePredicted + expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2 + summary: Device(s) have been predicted to fail soon + description: | + The device health module has determined that one or more devices will fail + soon. 
To review the device states use 'ceph device ls'. To show a specific + device use 'ceph device info '. + + Mark the OSD as out (so data may migrate to other OSDs in the cluster). Once + the osd is empty remove and replace the OSD. + - alert: CephDeviceFailurePredictionTooHigh + expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1 + for: 1m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.7 + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany + summary: Too many devices have been predicted to fail, unable to resolve + description: | + The device health module has determined that the number of devices predicted to + fail can not be remediated automatically, since it would take too many osd's out of + the cluster, impacting performance and potentially availabililty. You should add new + OSDs to the cluster to allow data to be relocated to avoid the data integrity issues. + - alert: CephDeviceFailureRelocationIncomplete + expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use + summary: A device failure is predicted, but unable to relocate data + description: | + The device health module has determined that one or more devices will fail + soon, but the normal process of relocating the data on the device to other + OSDs in the cluster is blocked. + + Check the the cluster has available freespace. It may be necessary to add + more disks to the cluster to allow the data from the failing device to + successfully migrate. + + - alert: CephOSDFlapping + expr: | + ( + rate(ceph_osd_up[5m]) + * on(ceph_daemon) group_left(hostname) ceph_osd_metadata + ) * 60 > 1 + labels: + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.4 + annotations: + documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds + summary: Network issues are causing OSD's to flap (mark each other out) + description: > + OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was + marked down and back up at {{ $value | humanize }} times once a + minute for 5 minutes. This could indicate a network issue (latency, + packet drop, disruption) on the clusters "cluster network". Check the + network environment on the listed host(s). + + - alert: CephOSDReadErrors + expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1 + for: 30s + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors + summary: Device read errors detected + description: > + An OSD has encountered read errors, but the OSD has recovered by retrying + the reads. This may indicate an issue with the Hardware or Kernel. 
+ # alert on high deviation from average PG count + - alert: CephPGImbalance + expr: | + abs( + ( + (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job) + ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job) + ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 + for: 5m + labels: + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.4.5 + annotations: + summary: PG allocations are not balanced across devices + description: > + OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates + by more than 30% from average PG count. + # alert on high commit latency...but how high is too high + + - name: mds + rules: + - alert: CephFilesystemDamaged + expr: ceph_health_detail{name="MDS_DAMAGE"} > 0 + for: 1m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.5.1 + annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages + summary: Ceph filesystem is damaged. + description: > + The filesystems metadata has been corrupted. Data access + may be blocked. + + Either analyse the output from the mds daemon admin socket, or + escalate to support + - alert: CephFilesystemOffline + expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0 + for: 1m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.5.3 + annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down + summary: Ceph filesystem is offline + description: > + All MDS ranks are unavailable. The ceph daemons providing the metadata + for the Ceph filesystem are all down, rendering the filesystem offline. + - alert: CephFilesystemDegraded + expr: ceph_health_detail{name="FS_DEGRADED"} > 0 + for: 1m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.5.4 + annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded + summary: Ceph filesystem is degraded + description: > + One or more metdata daemons (MDS ranks) are failed or in a + damaged state. At best the filesystem is partially available, + worst case is the filesystem is completely unusable. + - alert: CephFilesystemMDSRanksLow + expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max + summary: Ceph MDS daemon count is lower than configured + description: > + The filesystem's "max_mds" setting defined the number of MDS ranks in + the filesystem. The current number of active MDS daemons is less than + this setting. + - alert: CephFilesystemInsufficientStandby + expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby + summary: Ceph filesystem standby daemons too low + description: > + The minimum number of standby daemons determined by standby_count_wanted + is less than the actual number of standby daemons. Adjust the standby count + or increase the number of mds daemons within the filesystem. 
+ - alert: CephFilesystemFailureNoStandby + expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0 + for: 1m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.5.5 + annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds + summary: Ceph MDS daemon failed, no further standby available + description: > + An MDS daemon has failed, leaving only one active rank without + further standby. Investigate the cause of the failure or add a + standby daemon + - alert: CephFilesystemReadOnly + expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0 + for: 1m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.5.2 + annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages + summary: Ceph filesystem in read only mode, due to write error(s) + description: > + The filesystem has switched to READ ONLY due to an unexpected + write error, when writing to the metadata pool + + Either analyse the output from the mds daemon admin socket, or + escalate to support + + - name: mgr + rules: + - alert: CephMgrModuleCrash + expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1 + for: 5m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.6.1 + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash + summary: A mgr module has recently crashed + description: > + One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A + crashed module may impact functionality within the cluster. Use the 'ceph crash' commands to + investigate which module has failed, and archive it to acknowledge the failure. + - alert: CephMgrPrometheusModuleInactive + expr: up{job="ceph"} == 0 + for: 1m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.6.2 + annotations: + summary: Ceph's mgr/prometheus module is not available + description: > + The mgr/prometheus module at {{ $labels.instance }} is unreachable. This + could mean that the module has been disabled or the mgr itself is down. + + Without the mgr/prometheus module metrics and alerts will no longer + function. Open a shell to ceph and use 'ceph -s' to to determine whether the + mgr is active. If the mgr is not active, restart it, otherwise you can check + the mgr/prometheus module is loaded with 'ceph mgr module ls' and if it's + not listed as enabled, enable it with 'ceph mgr module enable prometheus' + + - name: pgs + rules: + - alert: CephPGsInactive + expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0 + for: 5m + labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.7.1 + annotations: + summary: One or more Placement Groups are inactive + description: > + {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. + Inactive placement groups aren't able to serve read/write + requests. + - alert: CephPGsUnclean + expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0 + for: 15m + labels: + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.7.2 + annotations: + summary: One or more platcment groups are marked unclean + description: > + {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}. + Unclean PGs haven't been able to completely recover from a previous failure. 
+      - alert: CephPGsDamaged
+        expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
+        for: 5m
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.4
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
+          summary: Placement group damaged, manual intervention needed
+          description: >
+            During data consistency checks (scrub), at least one PG has been flagged as being
+            damaged or inconsistent.
+
+            Check to see which PG is affected, and attempt a manual repair if necessary. To list
+            problematic placement groups, use 'rados list-inconsistent-pg '. To repair PGs, use
+            the 'ceph pg repair ' command.
+      - alert: CephPGRecoveryAtRisk
+        expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.5
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
+          summary: OSDs are too full for automatic recovery
+          description: >
+            Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their
+            'full' threshold. Add more capacity to the cluster, or delete unwanted data.
+      - alert: CephPGUnavilableBlockingIO
+        # PG_AVAILABILITY, but an OSD is not in a DOWN state
+        expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.3
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
+          summary: Placement group is unavailable, blocking some I/O
+          description: >
+            Data availability is reduced, impacting the cluster's ability to service I/O to some data. One or
+            more placement groups (PGs) are in a state that blocks I/O.
+      - alert: CephPGBackfillAtRisk
+        expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.7.6
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
+          summary: Backfill operations are blocked due to lack of free space
+          description: >
+            Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs
+            have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data.
+      - alert: CephPGNotScrubbed
+        expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
+        for: 5m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
+          summary: Placement group(s) have not been scrubbed
+          description: |
+            One or more PGs have not been scrubbed recently. The scrub process is a data integrity
+            feature, protecting against bit-rot. It checks that objects and their metadata (size and
+            attributes) match across object replicas. When PGs miss their scrub window, it may
+            indicate that the scrub window is too small, or that PGs were not in a 'clean' state during the
+            scrub window.
+
+            You can manually initiate a scrub with: ceph pg scrub
+      - alert: CephPGsHighPerOSD
+        expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
+          summary: Placement groups per OSD is too high
+          description: |
+            The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).
+
+            Check that the pg_autoscaler hasn't been disabled for any of the pools, with 'ceph osd pool autoscale-status',
+            and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide
+            the autoscaler based on the expected relative size of the pool
+            (e.g. 'ceph osd pool set cephfs.cephfs.meta target_size_ratio .1').
+      - alert: CephPGNotDeepScrubbed
+        expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
+        for: 5m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
+          summary: Placement group(s) have not been deep scrubbed
+          description: |
+            One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity
+            feature, protecting against bit-rot. It compares the contents of objects and their
+            replicas for inconsistency. When PGs miss their deep scrub window, it may indicate
+            that the window is too small or that PGs were not in a 'clean' state during the deep-scrub
+            window.
+
+            You can manually initiate a deep scrub with: ceph pg deep-scrub
+
+  - name: nodes
+    rules:
+      - alert: CephNodeRootFilesystemFull
+        expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
+        for: 5m
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.8.1
+        annotations:
+          summary: Root filesystem is dangerously full
+          description: >
+            Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
+
+      # alert on NIC packet error and drop rates > 0.01% or > 10 packets/s
+      - alert: CephNodeNetworkPacketDrops
+        expr: |
+          (
+            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_drop_total{device!="lo"}[1m])
+          ) / (
+            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_packets_total{device!="lo"}[1m])
+          ) >= 0.0001 or (
+            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_drop_total{device!="lo"}[1m])
+          ) >= 10
+        labels:
+          severity: warning
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.8.2
+        annotations:
+          summary: One or more NICs are seeing packet drops
+          description: >
+            Node {{ $labels.instance }} experiences packet drop > 0.01% or >
+            10 packets/s on interface {{ $labels.device }}.
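The node-level rules above operate purely on node_exporter series, so they can be unit-tested the same way by synthesising the filesystem metrics. A rough promtool sketch for CephNodeRootFilesystemFull follows; the instance label and the byte values are made up, chosen so that free space sits at 4% and stays below the 5% threshold for the whole window:

    rule_files:
      - ../prometheus_alerts.yml
    evaluation_interval: 1m
    tests:
      - interval: 1m
        input_series:
          # 4 bytes free out of 100 -> 4% free, below the 5% threshold
          - series: 'node_filesystem_avail_bytes{instance="node-1",mountpoint="/"}'
            values: '4 4 4 4 4 4 4'
          - series: 'node_filesystem_size_bytes{instance="node-1",mountpoint="/"}'
            values: '100 100 100 100 100 100 100'
        alert_rule_test:
          - eval_time: 6m   # beyond the 5m 'for' clause
            alertname: CephNodeRootFilesystemFull
            exp_alerts:
              - exp_labels:
                  instance: node-1
                  mountpoint: /
                  severity: critical
                  type: ceph_default
                  oid: 1.3.6.1.4.1.50495.1.2.1.8.1

Because the rule divides two vectors that share the instance and mountpoint labels, those labels carry through to the fired alert alongside the rule's own labels.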
+
+      - alert: CephNodeNetworkPacketErrors
+        expr: |
+          (
+            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_errs_total{device!="lo"}[1m])
+          ) / (
+            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_packets_total{device!="lo"}[1m])
+          ) >= 0.0001 or (
+            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+            increase(node_network_transmit_errs_total{device!="lo"}[1m])
+          ) >= 10
+        labels:
+          severity: warning
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.8.3
+        annotations:
+          summary: One or more NICs are seeing packet errors
+          description: >
+            Node {{ $labels.instance }} experiences packet errors > 0.01% or
+            > 10 packets/s on interface {{ $labels.device }}.
+
+      # Restrict to device names beginning with '/' to skip false alarms from
+      # tmpfs, overlay type filesystems
+      - alert: CephNodeDiskspaceWarning
+        expr: |
+          predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
+          on(instance) group_left(nodename) node_uname_info < 0
+        labels:
+          severity: warning
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.8.4
+        annotations:
+          summary: Host filesystem free space is getting low
+          description: >
+            Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
+            will be full in less than 5 days assuming the average fill-up
+            rate of the past 48 hours.
+
+      - alert: CephNodeInconsistentMTU
+        expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          summary: MTU settings across Ceph hosts are inconsistent
+          description: >
+            Node {{ $labels.instance }} has a different MTU size ({{ $value }})
+            than the median value on device {{ $labels.device }}.
+
+  - name: pools
+    rules:
+      - alert: CephPoolGrowthWarning
+        expr: |
+          (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
+          group_right ceph_pool_metadata) >= 95
+        labels:
+          severity: warning
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.9.2
+        annotations:
+          summary: Pool growth rate may soon exceed its capacity
+          description: >
+            Pool '{{ $labels.name }}' will be full in less than 5 days
+            assuming the average fill-up rate of the past 48 hours.
+      - alert: CephPoolBackfillFull
+        expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          summary: Free space in a pool is too low for recovery/rebalance
+          description: >
+            A pool is approaching its near-full threshold, which will
+            prevent rebalance operations from completing. You should
+            consider adding more capacity to the pool.
+
+      - alert: CephPoolFull
+        expr: ceph_health_detail{name="POOL_FULL"} > 0
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.9.1
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
+          summary: Pool is full - writes are blocked
+          description: |
+            A pool has reached its MAX quota, or the OSDs supporting the pool
+            have reached their FULL threshold. Until this is resolved, writes to
+            the pool will be blocked.
+            Pool Breakdown (top 5)
+            {{- range query "topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))" }}
+              - {{ .Labels.name }} at {{ .Value }}%
+            {{- end }}
+            Either increase the pool's quota, or add capacity to the cluster first
+            and then increase its quota (e.g. ceph osd pool set quota max_bytes )
+      - alert: CephPoolNearFull
+        expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0
+        for: 5m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          summary: One or more Ceph pools are getting full
+          description: |
+            A pool has exceeded its warning (percent full) threshold, or the OSDs
+            supporting the pool have reached their NEARFULL thresholds. Writes may
+            continue, but you are at risk of the pool going read only if more capacity
+            isn't made available.
+
+            Determine the affected pool with 'ceph df detail', for example looking
+            at QUOTA BYTES and STORED. Either increase the pool's quota, or add
+            capacity to the cluster first and then increase its quota
+            (e.g. ceph osd pool set quota max_bytes )
+  - name: healthchecks
+    rules:
+      - alert: CephSlowOps
+        expr: ceph_healthcheck_slow_ops > 0
+        for: 30s
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
+          summary: MON/OSD operations are slow to complete
+          description: >
+            {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded).
+# cephadm alerts
+  - name: cephadm
+    rules:
+      - alert: CephadmUpgradeFailed
+        expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
+        for: 30s
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.11.2
+        annotations:
+          summary: Ceph version upgrade has failed
+          description: >
+            The cephadm cluster upgrade process has failed. The cluster remains in
+            an undetermined state.
+
+            Please review the cephadm logs to understand the nature of the issue.
+      - alert: CephadmDaemonFailed
+        expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
+        for: 30s
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.11.1
+        annotations:
+          summary: A ceph daemon managed by cephadm is down
+          description: >
+            A daemon managed by cephadm is no longer active. Determine which
+            daemon is down with 'ceph health detail'. You may start daemons with
+            the 'ceph orch daemon start ' command.
+      - alert: CephadmPaused
+        expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
+        for: 1m
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
+          summary: Orchestration tasks via cephadm are PAUSED
+          description: >
+            Cluster management has been paused manually. This will prevent the
+            orchestrator from performing service management and reconciliation. If this is
+            not intentional, resume cephadm operations with 'ceph orch resume'.
+
+# prometheus alerts
+  - name: PrometheusServer
+    rules:
+      - alert: PrometheusJobMissing
+        expr: absent(up{job="ceph"})
+        for: 30s
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.12.1
+        annotations:
+          summary: The scrape job for Ceph is missing from Prometheus
+          description: |
+            The prometheus job that scrapes from Ceph is no longer defined; this
+            effectively means you'll have no metrics or alerts for the cluster.
+
+            Please review the job definitions in the prometheus.yml file of the prometheus
+            instance.
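Both PrometheusJobMissing and CephMgrPrometheusModuleInactive assume the mgr/prometheus endpoint is scraped under a job literally named "ceph". A minimal prometheus.yml fragment that satisfies that assumption and loads the renamed rules file could look like the sketch below; the rule file path and the mgr host name are illustrative, while 9283 is the default port of the mgr/prometheus module:

    global:
      scrape_interval: 15s
      evaluation_interval: 15s

    rule_files:
      # the file this patch renames from prometheus_alerts.yaml
      - /etc/prometheus/ceph/prometheus_alerts.yml

    scrape_configs:
      - job_name: 'ceph'            # must match up{job="ceph"} used by the alerts above
        honor_labels: true
        static_configs:
          - targets: ['ceph-mgr-host:9283']   # hypothetical mgr/prometheus endpoint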
+# Object related events
+  - name: rados
+    rules:
+      - alert: CephObjectMissing
+        expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
+        for: 30s
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.10.1
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
+          summary: Object(s) have been marked UNFOUND
+          description: |
+            A version of a RADOS object cannot be found, even though all OSDs are up. I/O
+            requests for this object from clients will block (hang). Resolving this issue may
+            require the object to be rolled back to a prior version manually, and then verified.
+# Generic
+  - name: generic
+    rules:
+      - alert: CephDaemonCrash
+        expr: ceph_health_detail{name="RECENT_CRASH"} == 1
+        for: 1m
+        labels:
+          severity: critical
+          type: ceph_default
+          oid: 1.3.6.1.4.1.50495.1.2.1.1.2
+        annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
+          summary: One or more Ceph daemons have crashed, and are pending acknowledgement
+          description: |
+            One or more daemons have crashed recently, and need to be acknowledged. This notification
+            ensures that software crashes don't go unseen. To acknowledge a crash, use the
+            'ceph crash archive ' command.
diff --git a/monitoring/ceph-mixin/tests_alerts/settings.py b/monitoring/ceph-mixin/tests_alerts/settings.py
index 9dc639fd30cb..d99dfdca6fbd 100644
--- a/monitoring/ceph-mixin/tests_alerts/settings.py
+++ b/monitoring/ceph-mixin/tests_alerts/settings.py
@@ -1,6 +1,6 @@
 import os
 
-ALERTS_FILE = '../prometheus_alerts.yaml'
+ALERTS_FILE = '../prometheus_alerts.yml'
 UNIT_TESTS_FILE = 'test_alerts.yml'
 MIB_FILE = '../../snmp/CEPH-MIB.txt'
diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index 14dfb942b62f..d7fe0132d2da 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -1,5 +1,5 @@
 rule_files:
-  - ../prometheus_alerts.yaml
+  - ../prometheus_alerts.yml
 evaluation_interval: 5m
 tests:
 # health error
diff --git a/monitoring/ceph-mixin/tox.ini b/monitoring/ceph-mixin/tox.ini
index e15e17084f7c..e6cae299d68f 100644
--- a/monitoring/ceph-mixin/tox.ini
+++ b/monitoring/ceph-mixin/tox.ini
@@ -64,6 +64,6 @@ depends = grafonnet-check
 whitelist_externals = promtool
 commands =
-    lint: promtool check rules prometheus_alerts.yaml
+    lint: promtool check rules prometheus_alerts.yml
     test: pytest -rA tests_alerts/test_syntax.py tests_alerts/test_unittests.py
          python3 ./tests_alerts/validate_rules.py
diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm
index 10b608fe139e..8eaae9261965 100755
--- a/src/cephadm/cephadm
+++ b/src/cephadm/cephadm
@@ -2592,8 +2592,8 @@ def get_container_mounts(ctx, fsid, daemon_type, daemon_id,
             mounts[ceph_folder + '/src/cephadm/cephadm'] = '/usr/sbin/cephadm'
             mounts[ceph_folder + '/src/pybind/mgr'] = '/usr/share/ceph/mgr'
             mounts[ceph_folder + '/src/python-common/ceph'] = '/usr/lib/python3.6/site-packages/ceph'
-            mounts[ceph_folder + '/monitoring/grafana/dashboards'] = '/etc/grafana/dashboards/ceph-dashboard'
-            mounts[ceph_folder + '/monitoring/prometheus/alerts'] = '/etc/prometheus/ceph'
+            mounts[ceph_folder + '/monitoring/ceph-mixin/dashboards_out'] = '/etc/grafana/dashboards/ceph-dashboard'
+            mounts[ceph_folder + '/monitoring/ceph-mixin/prometheus_alerts.yml'] = '/etc/prometheus/ceph/ceph_default_alerts.yml'
         else:
             logger.error('{}{}{}'.format(termcolor.red,
                          'Ceph shared source folder does not exist.',
diff --git a/src/pybind/mgr/dashboard/ci/check_grafana_dashboards.py b/src/pybind/mgr/dashboard/ci/check_grafana_dashboards.py
index 03b31a45e173..d37337b404ed 100644
--- a/src/pybind/mgr/dashboard/ci/check_grafana_dashboards.py
+++ b/src/pybind/mgr/dashboard/ci/check_grafana_dashboards.py
@@ -10,7 +10,7 @@ Usage:
 e.g.
   cd /ceph/src/pybind/mgr/dashboard
-  python ci/