rules: [
{
alert: 'CephPoolGrowthWarning',
- expr: '(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(%(cluster)spool_id) group_right ceph_pool_metadata) >= 95' % $.MultiClusterQuery(),
+ expr: '(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(%(cluster)spool_id, instance) group_right() ceph_pool_metadata) >= 95' % $.MultiClusterQuery(),
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.9.2' },
annotations: {
summary: 'Pool growth rate may soon exceed capacity%(cluster)s' % $.MultiClusterSummary(),
type: "ceph_default"
# Fires when a bonded network interface on a node has at least one inactive
# slave (active count differs from configured slave count).
# Diff hunk resolved: the patch moved `summary` below `description` so the
# annotation keys are in alphabetical order; the `+` side is applied here.
- alert: "CephNodeNetworkBondDegraded"
  annotations:
    description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}."
    summary: "Degraded Bond on Node {{ $labels.instance }}"
  expr: |
    node_bonding_slaves - node_bonding_active != 0
  labels:
    severity: "warning"
    type: "ceph_default"
# Fires after 30s when any Ceph daemon reports SLOW_OPS health metrics,
# i.e. operations exceeding the configured complaint time.
# Diff hunk resolved: the patch alphabetized the annotation keys, moved
# `expr`/`for`/`labels` after `annotations`, and normalized single quotes to
# double quotes; the `+` side is applied here and the `-` lines dropped.
- alert: "CephDaemonSlowOps"
  annotations:
    description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
    documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
    summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
  expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
  for: "30s"
  labels:
    severity: "warning"
    type: "ceph_default"
- name: "cephadm"
rules:
- alert: "CephadmUpgradeFailed"