labels:
severity: critical
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.2.1
annotations:
- description: Ceph in health_error state for more than 5m
+ description: Ceph in health_error state for more than 5m.
# Warning-severity cluster health rule: the condition is ceph_health_status == 1
# and it must hold for 15 minutes before the alert fires (the description
# below refers to this state as health_warn).
# NOTE(review): the oid label looks like an SNMP object identifier assigned to
# this alert -- confirm against whatever consumes it.
- alert: health warn
expr: ceph_health_status == 1
for: 15m
labels:
severity: warning
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.2.2
annotations:
description: Ceph in health_warn for more than 15m.
- name: mon
labels:
severity: critical
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.3.1
annotations:
description: Monitor count in quorum is low.
- name: osd
labels:
severity: critical
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.4.1
annotations:
description: More than 10% of OSDs are down.
- alert: OSD down
labels:
severity: warning
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.4.2
annotations:
description: One or more OSDs down for more than 15 minutes.
- alert: OSDs near full
labels:
severity: critical
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.4.3
annotations:
description: OSD {{ $labels.ceph_daemon }} is dangerously full, over 80%.
# alert on single OSDs flapping
labels:
severity: warning
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.4.4
annotations:
description: >
OSD {{ $labels.ceph_daemon }} was marked down and back up at least once a
labels:
severity: warning
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.4.5
annotations:
description: >
OSD {{ $labels.ceph_daemon }} deviates by more than 30% from
labels:
severity: critical
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.7.1
annotations:
description: One or more PGs are inactive for more than 5 minutes.
- alert: pgs unclean
labels:
severity: warning
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.7.2
annotations:
description: One or more PGs are not clean for more than 15 minutes.
- name: nodes
labels:
severity: critical
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.8.1
annotations:
description: Root volume (OSD and MON store) is dangerously full (< 5% free).
# alert on NIC packet error and drop rates > 1 packet/s
labels:
severity: warning
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.8.2
annotations:
description: >
Node {{ $labels.instance }} experiences packet drop > 1
labels:
severity: warning
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.8.3
annotations:
description: >
Node {{ $labels.instance }} experiences packet errors > 1
labels:
severity: warning
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.8.4
annotations:
description: >
Mountpoint {{ $labels.mountpoint }} will be full in less than 5 days
labels:
severity: critical
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.9.1
annotations:
description: Pool {{ $labels.name }} at 90% capacity or over.
- alert: pool filling up
labels:
severity: warning
type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.9.2
annotations:
description: >
Pool {{ $labels.name }} will be full in less than 5 days