--- /dev/null
+rule_files:
+ - ceph_default_alerts.yml
+evaluation_interval: 5m
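+# Unit tests for the rules in ceph_default_alerts.yml, in the format
+# consumed by "promtool test rules <this file>". In input_series values,
+# promtool's expanding notation 'a+bxN' means: start at a, then emit N more
+# samples, each incremented by b per interval (so '1+1x100' rises by one
+# each scrape and '1+0x100' stays flat at 1).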
+tests:
+ # health error
+ - interval: 5m
+ input_series:
+ - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
+ values: '2 2 2 2 2 2 2'
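+ # ceph_health_status is 2 for HEALTH_ERR (1 is HEALTH_WARN, 0 is
+ # HEALTH_OK), so a constant 2 keeps the error condition true; no alert is
+ # expected at 1m, before the rule's 5-minute hold elapses, but one is
+ # firing by 6m.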
+ promql_expr_test:
+ - expr: ceph_health_status == 2
+ eval_time: 5m
+ exp_samples:
+ - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
+ value: 2
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: health error
+ - eval_time: 6m
+ alertname: health error
+ exp_alerts:
+ - exp_labels:
+ instance: ceph:9283
+ job: ceph
+ oid: 1.3.6.1.4.1.50495.15.1.2.2.1
+ type: ceph_default
+ severity: critical
+ exp_annotations:
+ description: >
+ Ceph in HEALTH_ERROR state for more than 5 minutes.
+ Please check "ceph health detail" for more information.
+
+ # health warning
+ - interval: 5m
+ input_series:
+ - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
+ values: '1 1 1 1 1 1 1 1 1 1'
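+ # a constant 1 keeps the cluster in HEALTH_WARN; nothing fires at 10m,
+ # before the rule's 15-minute hold elapses, but the alert is firing
+ # by 20m.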
+ promql_expr_test:
+ - expr: ceph_health_status == 1
+ eval_time: 15m
+ exp_samples:
+ - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 10m
+ alertname: health warn
+ - eval_time: 20m
+ alertname: health warn
+ exp_alerts:
+ - exp_labels:
+ instance: ceph:9283
+ job: ceph
+ oid: 1.3.6.1.4.1.50495.15.1.2.2.2
+ type: ceph_default
+ severity: warning
+ exp_annotations:
+ description: >
+ Ceph has been in HEALTH_WARN for more than 15 minutes.
+ Please check "ceph health detail" for more information.
+
+ # low monitor quorum count
+ - interval: 1m
+ input_series:
+ - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a",instance="ceph:9283",
+ job="ceph"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b",instance="ceph:9283",
+ job="ceph"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c",instance="ceph:9283",
+ job="ceph"}'
+ values: '0 0 0 0 0'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.a",ceph_version="ceph version
+ 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
+ (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
+ public_addr="172.20.0.2",rank="0"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.b",ceph_version="ceph version
+ 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
+ (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
+ public_addr="172.20.0.2",rank="1"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.c",ceph_version="ceph version
+ 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
+ (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
+ public_addr="172.20.0.2",rank="2"}'
+ values: '1 1 1 1 1'
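+ # mon.c reports 0 (out of quorum), so sum(ceph_mon_quorum_status) is
+ # 1 + 1 + 0 = 2, below the required quorum of 3.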
+ promql_expr_test:
+ - expr: sum(ceph_mon_quorum_status) < 3
+ eval_time: 1m
+ exp_samples:
+ - labels: '{}'
+ value: 2
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: low monitor quorum count
+ exp_alerts:
+ - exp_labels:
+ oid: 1.3.6.1.4.1.50495.15.1.2.3.1
+ type: ceph_default
+ severity: critical
+ exp_annotations:
+ description: |
+ Monitor count in quorum is below three.
+
+ Only 2 of 3 monitors are active.
+
+ The following monitors are down:
+ - mon.c on ceph
+
+ # 10% OSDs down
+ - interval: 1m
+ input_series:
+ - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
+ values: '0 0 0 0 0'
+ - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1'
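+ # one of three OSDs is down: 1 / 3 * 100 ≈ 33.33%, at or above the 10%
+ # threshold.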
+ promql_expr_test:
+ - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
+ eval_time: 1m
+ exp_samples:
+ - labels: '{}'
+ value: 3.333333333333333E+01
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: 10% OSDs down
+ exp_alerts:
+ - exp_labels:
+ oid: 1.3.6.1.4.1.50495.15.1.2.4.1
+ type: ceph_default
+ severity: critical
+ exp_annotations:
+ description: |
+ 33.33% or 1 of 3 OSDs are down (≥ 10%).
+
+ The following OSDs are down:
+ - osd.1 on ceph
+
+ # OSD down
+ - interval: 1m
+ input_series:
+ - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
+ values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+ - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
+ values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
+ - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
+ values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
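+ # count(ceph_osd_up == 0) stays at 1 throughout; the alert is asserted at
+ # 15m to let the rule's 15-minute hold elapse.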
+ promql_expr_test:
+ - expr: count(ceph_osd_up == 0) > 0
+ eval_time: 1m
+ exp_samples:
+ - labels: '{}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 15m
+ alertname: OSD down
+ exp_alerts:
+ - exp_labels:
+ oid: 1.3.6.1.4.1.50495.15.1.2.4.2
+ type: ceph_default
+ severity: warning
+ exp_annotations:
+ description: |
+
+ 1 OSD down for more than 15 minutes.
+
+ 1 of 3 OSDs are down.
+
+ The following OSD is down:
+ - osd.1 on ceph
+
+ # OSDs near full
+ - interval: 1m
+ input_series:
+ - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.0",instance="ceph:9283"
+ ,job="ceph"}'
+ values: '1076310016 1076310016 1076310016 1076310016 1076310016
+ 1076310016'
+ - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.1",instance="ceph:9283"
+ ,job="ceph"}'
+ values: '1076310016 1076310016 1076310016 1076310016 1076310016
+ 1076310016'
+ - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.2",instance="ceph:9283"
+ ,job="ceph"}'
+ values: '1076310016 1076310016 1076310016 1076310016 1076310016
+ 106447810032'
+ - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.0",instance="ceph:9283"
+ ,job="ceph"}'
+ values: '108447916032 108447916032 108447916032 108447916032 108447916032
+ 108447916032'
+ - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.1",instance="ceph:9283"
+ ,job="ceph"}'
+ values: '108447916032 108447916032 108447916032 108447916032 108447916032
+ 108447916032'
+ - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.2",instance="ceph:9283"
+ ,job="ceph"}'
+ values: '108447916032 108447916032 108447916032 108447916032 108447916032
+ 108447916032'
+ - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
+ values: '1 1 1 1 1 1'
+ - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
+ values: '1 1 1 1 1 1'
+ - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
+ values: '1 1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1'
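+ # at the 5m sample osd.2 has used 106447810032 of 108447916032 bytes,
+ # i.e. about 98.16% utilisation, above the 90% threshold; the other OSDs
+ # stay near 1%.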
+ promql_expr_test:
+ - expr: |
+ (
+ ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon)
+ ceph_osd_up == 1) * on(ceph_daemon) group_left(hostname)
+ ceph_osd_metadata
+ ) * 100 > 90
+ eval_time: 5m
+ exp_samples:
+ - labels: '{ceph_daemon="osd.2",hostname="ceph",instance="ceph:9283",
+ job="ceph"}'
+ value: 9.815569899986845E+01
+ alert_rule_test:
+ - eval_time: 10m
+ alertname: OSDs near full
+ exp_alerts:
+ - exp_labels:
+ ceph_daemon: osd.2
+ hostname: ceph
+ instance: ceph:9283
+ job: ceph
+ oid: 1.3.6.1.4.1.50495.15.1.2.4.3
+ type: ceph_default
+ severity: critical
+ exp_annotations:
+ description: >
+ OSD osd.2 on ceph is dangerously full: 98.16%
+
+ # flapping OSD
+ - interval: 1s
+ input_series:
+ - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
+ values: '1+1x100'
+ - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
+ values: '1+0x100'
+ - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
+ values: '1+0x100'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1'
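+ # osd.0 ('1+1x100') rises by one every second, giving a positive rate;
+ # rate(ceph_osd_up[5m]) * 60 comes out at roughly 12.2 at the 1m
+ # evaluation, above the threshold of 1. The flat osd.1 and osd.2 series
+ # ('1+0x100') have zero rate and don't match.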
+ promql_expr_test:
+ - expr: |
+ (
+ rate(ceph_osd_up[5m])
+ * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
+ ) * 60 > 1
+ eval_time: 1m
+ exp_samples:
+ - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
+ job="ceph"}'
+ value: 1.2200000000000001E+01
+ alert_rule_test:
+ - eval_time: 5m
+ alertname: flapping OSD
+ exp_alerts:
+ - exp_labels:
+ ceph_daemon: osd.0
+ hostname: ceph
+ instance: ceph:9283
+ job: ceph
+ oid: 1.3.6.1.4.1.50495.15.1.2.4.4
+ severity: warning
+ type: ceph_default
+ exp_annotations:
+ description: >
+ OSD osd.0 on ceph was
+ marked down and back up at 20.1 times once a
+ minute for 5 minutes.
+
+ # high pg count deviation
+ - interval: 1m
+ input_series:
+ - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",
+ job="ceph"}'
+ values: '169 169 169 169 169 169'
+ - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",
+ job="ceph"}'
+ values: '169 169 169 169 169 90'
+ - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",
+ job="ceph"}'
+ values: '169 169 169 169 169 169'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
+ values: '1 1 1 1 1 1'
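+ # at the 5m sample the PG counts are 169, 90 and 169; the average is
+ # (169 + 90 + 169) / 3 ≈ 142.67, so osd.1 deviates by
+ # |90 - 142.67| / 142.67 ≈ 0.369, above the 0.30 limit.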
+ promql_expr_test:
+ - expr: |
+ abs(
+ (
+ (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
+ by (job)
+ ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
+ ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
+ eval_time: 5m
+ exp_samples:
+ - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
+ job="ceph"}'
+ value: 3.691588785046729E-01
+ alert_rule_test:
+ - eval_time: 10m
+ alertname: high pg count deviation
+ exp_alerts:
+ - exp_labels:
+ ceph_daemon: osd.1
+ hostname: ceph
+ instance: ceph:9283
+ job: ceph
+ oid: 1.3.6.1.4.1.50495.15.1.2.4.5
+ severity: warning
+ type: ceph_default
+ exp_annotations:
+ description: >
+ OSD osd.1 on ceph deviates
+ by more than 30% from average PG count.
+
+ # pgs inactive
+ - interval: 1m
+ input_series:
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+ name="device_health_metrics",pool_id="1"}'
+ values: '1 1 1 1 1 1 1 1'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+ name="device_health_metrics",pool_id="2"}'
+ values: '1 1 1 1 1 1 1 1'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+ name="device_health_metrics",pool_id="3"}'
+ values: '1 1 1 1 1 1 1 1'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
+ values: '1 1 1 1 1 1 1 1'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
+ values: '32 32 32 32 32 32 32 32'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
+ values: '33 32 32 32 32 33 33 32'
+ - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
+ values: '1 1 1 1 1 1 1 1'
+ - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
+ values: '32 32 32 32 32 32 32 32'
+ - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
+ values: '32 32 32 32 32 32 32 32'
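+ # at 5m pool 3 reports 33 total but only 32 active PGs, so
+ # ceph_pg_total - ceph_pg_active is 1 for that pool and 0 elsewhere.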
+ promql_expr_test:
+ - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
+ (ceph_pg_total - ceph_pg_active) > 0
+ eval_time: 5m
+ exp_samples:
+ - labels: '{instance="ceph:9283", job="ceph",
+ name="device_health_metrics",
+ pool_id="3"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 5m
+ alertname: pgs inactive
+ exp_alerts:
+ - exp_labels:
+ instance: ceph:9283
+ job: ceph
+ name: device_health_metrics
+ oid: 1.3.6.1.4.1.50495.15.1.2.7.1
+ pool_id: 3
+ severity: critical
+ type: ceph_default
+ exp_annotations:
+ description: >
+ 1 PGs have been inactive for more than 5 minutes in pool
+ device_health_metrics.
+ Inactive placement groups aren't able to serve read/write
+ requests.
+
+ # pgs unclean
+ - interval: 1m
+ input_series:
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+ name="device_health_metrics",pool_id="1"}'
+ values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+ name="device_health_metrics",pool_id="2"}'
+ values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+ name="device_health_metrics",pool_id="3"}'
+ values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
+ values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
+ values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
+ 32 32 32'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
+ values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
+ 33 33 33'
+ - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
+ values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+ - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
+ values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
+ 32 32'
+ - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
+ values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
+ 32 32'
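+ # pool 3 reports 33 total but only 32 clean PGs for the whole run, so one
+ # PG stays unclean past the 15-minute mark.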
+ promql_expr_test:
+ - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
+ (ceph_pg_total - ceph_pg_clean) > 0
+ eval_time: 15m
+ exp_samples:
+ - labels: '{instance="ceph:9283", job="ceph",
+ name="device_health_metrics", pool_id="3"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 16m
+ alertname: pgs unclean
+ exp_alerts:
+ - exp_labels:
+ instance: ceph:9283
+ job: ceph
+ name: device_health_metrics
+ oid: 1.3.6.1.4.1.50495.15.1.2.7.2
+ pool_id: 3
+ severity: warning
+ type: ceph_default
+ exp_annotations:
+ description: >
+ 1 PGs haven't been clean for more than 15 minutes in pool
+ device_health_metrics.
+ Unclean PGs haven't been able to completely recover from a
+ previous failure.
+
+ # root volume full
+ - interval: 1m
+ input_series:
+ - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost
+ --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
+ mountpoint="/"}'
+ values: '35336400896 35336400896 35336400896 35336400896 35336400896
+ 3533640089 3533640089'
+ - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost
+ --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
+ mountpoint="/"}'
+ values: '73445531648 73445531648 73445531648 73445531648 73445531648
+ 73445531648 73445531648'
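+ # from the 5m sample onwards only 3533640089 of 73445531648 bytes are
+ # free, i.e. about 4.81%, under the 5% threshold.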
+ promql_expr_test:
+ - expr: node_filesystem_avail_bytes{mountpoint="/"} /
+ node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
+ eval_time: 5m
+ exp_samples:
+ - labels: '{device="/dev/mapper/fedora_localhost --live-home",
+ fstype="ext4", instance="node-exporter", job="node-exporter",
+ mountpoint="/"}'
+ value: 4.8112390362092565E+00
+ alert_rule_test:
+ - eval_time: 10m
+ alertname: root volume full
+ exp_alerts:
+ - exp_labels:
+ device: /dev/mapper/fedora_localhost --live-home
+ fstype: ext4
+ instance: node-exporter
+ job: node-exporter
+ mountpoint: /
+ oid: 1.3.6.1.4.1.50495.15.1.2.8.1
+ severity: critical
+ type: ceph_default
+ exp_annotations:
+ description: >
+ Root volume (OSD and MON store) is dangerously full: 4.811% free.
+
+ # network packets dropped
+ - interval: 1s
+ input_series:
+ - series: 'node_network_receive_drop_total{device="eth0",
+ instance="node-exporter",job="node-exporter"}'
+ values: '1+1x500'
+ - series: 'node_network_transmit_drop_total{device="eth0",
+ instance="node-exporter",job="node-exporter"}'
+ values: '1+1x500'
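+ # both drop counters rise by one per second, so each 1m increase is about
+ # 60 and their sum is 120 >= 10; the ratio branch yields nothing because
+ # no *_packets_total series are provided, so the second branch of the
+ # 'or' produces the match.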
+ promql_expr_test:
+ - expr: |
+ (
+ increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_drop_total{device!="lo"}[1m])
+ ) / (
+ increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= 0.0001 or (
+ increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_drop_total{device!="lo"}[1m])
+ ) >= 10
+ eval_time: 5m
+ exp_samples:
+ - labels: '{device="eth0", instance="node-exporter",
+ job="node-exporter"}'
+ value: 1.2E+02
+ alert_rule_test:
+ - eval_time: 5m
+ alertname: network packets dropped
+ exp_alerts:
+ - exp_labels:
+ device: eth0
+ instance: node-exporter
+ job: node-exporter
+ oid: 1.3.6.1.4.1.50495.15.1.2.8.2
+ severity: warning
+ type: ceph_default
+ exp_annotations:
+ description: >
+ Node node-exporter experiences packet drop > 0.01% or >
+ 10 packets/s on interface eth0.
+
+ # network packets errors
+ - interval: 1s
+ input_series:
+ - series: 'node_network_receive_errs_total{device="eth0",
+ instance="node-exporter",job="node-exporter"}'
+ values: '1+1x500'
+ - series: 'node_network_transmit_errs_total{device="eth0",
+ instance="node-exporter",job="node-exporter"}'
+ values: '1+1x500'
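+ # same arithmetic as the drop test above: 60 + 60 = 120 errors per
+ # minute, satisfying the '>= 10' branch.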
+ promql_expr_test:
+ - expr: |
+ (
+ increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) / (
+ increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= 0.0001 or (
+ increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) >= 10
+ eval_time: 5m
+ exp_samples:
+ - labels: '{device="eth0", instance="node-exporter",
+ job="node-exporter"}'
+ value: 1.2E+02
+ alert_rule_test:
+ - eval_time: 5m
+ alertname: network packet errors
+ exp_alerts:
+ - exp_labels:
+ device: eth0
+ instance: node-exporter
+ job: node-exporter
+ oid: 1.3.6.1.4.1.50495.15.1.2.8.3
+ severity: warning
+ type: ceph_default
+ exp_annotations:
+ description: >
+ Node node-exporter experiences packet errors > 0.01% or > 10
+ packets/s on interface eth0.
+
+ # MTU Mismatch
+ - interval: 1m
+ input_series:
+ - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",
+ job="node-exporter"}'
+ values: '1500 1500 1500 1500 1500'
+ - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",
+ job="node-exporter"}'
+ values: '1500 1500 1500 1500 1500'
+ - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",
+ job="node-exporter"}'
+ values: '1500 1500 1500 1500 1500'
+ - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",
+ job="node-exporter"}'
+ values: '1500 1500 1500 1500 1500'
+ - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
+ job="node-exporter"}'
+ values: '9000 9000 9000 9000 9000'
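+ # quantile(0.5, ...) over {1500, 1500, 1500, 1500, 9000} is 1500, so only
+ # eth4's 9000-byte MTU differs from the median.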
+ promql_expr_test:
+ - expr: node_network_mtu_bytes{device!="lo"} != on() group_left()
+ (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
+ eval_time: 1m
+ exp_samples:
+ - labels: '{__name__="node_network_mtu_bytes", device="eth4",
+ instance="node-exporter", job="node-exporter"}'
+ value: 9000
+ alert_rule_test:
+ - eval_time: 1m
+ alertname: MTU Mismatch
+ exp_alerts:
+ - exp_labels:
+ device: eth4
+ instance: node-exporter
+ job: node-exporter
+ oid: 1.3.6.1.4.1.50495.15.1.2.8.5
+ severity: warning
+ type: ceph_default
+ exp_annotations:
+ description: >
+ Node node-exporter has a different MTU size (9000)
+ than the median value on device eth4.
+
+ # pool full
+ - interval: 1m
+ input_series:
+ - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="1"}'
+ values: '0 0 0 0 0 0 0 0 0'
+ - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="2"}'
+ values: '1850 1850 1850 1850 1850 1850 1850 1850 1850'
+ - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="3"}'
+ values: '10628706304000 10628706304000 23524 23524 23524 23524 23524 23524
+ 23524'
+ - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="1"}'
+ values: '106287063040 106287063040 106287063040 106287063040 106287063040
+ 106287063040 106287063040'
+ - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="2"}'
+ values: '106287063040 106287063040 106287063040 106287063040 106287063040
+ 106287063040 106287063040'
+ - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="3"}'
+ values: '106287063040 1 106287063040 106287063040 106287063040
+ 106287063040 106287063040'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+ name="device_health_metrics",pool_id="1"}'
+ values: '1 1 1 1 1 1 1 1 1'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+ name=".rgw.root",pool_id="2"}'
+ values: '1 1 1 1 1 1 1 1 1'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
+ name="default.rgw.log",pool_id="3"}'
+ values: '1 1 1 1 1 1 1 1 1'
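+ # utilisation is stored / (stored + max_avail); at the 1m sample pool 3
+ # has a max_avail of 1 byte against roughly 10.6 TB stored, putting it at
+ # effectively 100%, above the 90% threshold.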
+ promql_expr_test:
+ - expr: |
+ ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
+ * on(pool_id) group_right ceph_pool_metadata * 100 > 90
+ eval_time: 1m
+ exp_samples:
+ - labels: '{instance="ceph:9283", job="ceph", name="default.rgw.log",
+ pool_id="3"}'
+ value: 9.999999999999059E+01
+ alert_rule_test:
+ - eval_time: 2m
+ alertname: pool full
+ exp_alerts:
+ - exp_labels:
+ instance: ceph:9283
+ job: ceph
+ name: default.rgw.log
+ oid: 1.3.6.1.4.1.50495.15.1.2.9.1
+ pool_id: 3
+ severity: critical
+ type: ceph_default
+ exp_annotations:
+ description: Pool default.rgw.log at 99.01% capacity.
+
+ # slow OSD ops
+ - interval: 1m
+ input_series:
+ - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
+ values: '1+0x120'
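+ # '1+0x120' holds the healthcheck at a constant 1 for two hours, so
+ # ceph_healthcheck_slow_ops > 0 stays true throughout.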
+ promql_expr_test:
+ - expr: ceph_healthcheck_slow_ops > 0
+ eval_time: 1m
+ exp_samples:
+ - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",
+ job="ceph"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 20m
+ alertname: Slow OSD Ops
+ exp_alerts:
+ - exp_labels:
+ instance: ceph:9283
+ job: ceph
+ severity: warning
+ type: ceph_default
+ exp_annotations:
+ description: >
+ 1 OSD requests are taking too long to process
+ (osd_op_complaint_time exceeded)