From dd3da4ebf0296d9e589fc26246a81bef85a9bed5 Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Wed, 4 Oct 2023 10:00:26 +0000 Subject: [PATCH] monitoring: add new alerts This adds new hardware monitoring alerts. Signed-off-by: Guillaume Abrioux (cherry picked from commit 76d8e0bbbf2c5130a325943ffe09791cbd4f2feb) --- .../ceph-mixin/prometheus_alerts.libsonnet | 65 ++++++++ monitoring/ceph-mixin/prometheus_alerts.yml | 62 ++++++++ .../ceph-mixin/tests_alerts/test_alerts.yml | 144 ++++++++++++++++++ monitoring/snmp/README.md | 1 + 4 files changed, 272 insertions(+) diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index b7ec0da2f04..a6ab4c2a3f9 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -689,6 +689,71 @@ }, ], }, + { + name: 'hardware', + rules: [ + { + alert: 'HardwareStorageError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_STORAGE"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.1' }, + annotations: { + summary: 'Storage devices error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Some storage devices are in error. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareMemoryError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_MEMORY"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.2' }, + annotations: { + summary: 'DIMM error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'DIMM error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareProcessorError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.3' }, + annotations: { + summary: 'Processor error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Processor error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareNetworkError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_NETWORK"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.4' }, + annotations: { + summary: 'Network error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Network error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwarePowerError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_POWER"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.5' }, + annotations: { + summary: 'Power supply error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Power supply error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareFanError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_FANS"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.6' }, + annotations: { + summary: 'Fan error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Fan error(s) detected. Check `ceph health detail`.', + }, + }, + ], + }, { name: 'PrometheusServer', rules: [ diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 4a3e6acf389..e491c753f3c 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -614,6 +614,68 @@ groups: labels: severity: "warning" type: "ceph_default" + - name: "hardware" + rules: + - alert: "HardwareStorageError" + annotations: + description: "Some storage devices are in error. Check `ceph health detail`." + summary: "Storage devices error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.1" + severity: "critical" + type: "ceph_default" + - alert: "HardwareMemoryError" + annotations: + description: "DIMM error(s) detected. Check `ceph health detail`." + summary: "DIMM error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.2" + severity: "critical" + type: "ceph_default" + - alert: "HardwareProcessorError" + annotations: + description: "Processor error(s) detected. Check `ceph health detail`." + summary: "Processor error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.3" + severity: "critical" + type: "ceph_default" + - alert: "HardwareNetworkError" + annotations: + description: "Network error(s) detected. Check `ceph health detail`." + summary: "Network error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.4" + severity: "critical" + type: "ceph_default" + - alert: "HardwarePowerError" + annotations: + description: "Power supply error(s) detected. Check `ceph health detail`." + summary: "Power supply error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.5" + severity: "critical" + type: "ceph_default" + - alert: "HardwareFanError" + annotations: + description: "Fan error(s) detected. Check `ceph health detail`." + summary: "Fan error(s) detected" + expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.13.6" + severity: "critical" + type: "ceph_default" - name: "PrometheusServer" rules: - alert: "PrometheusJobMissing" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 1aaea88e7f5..4768af7de40 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -1886,3 +1886,147 @@ tests: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash summary: One or more Ceph daemons have crashed, and are pending acknowledgement description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive ' command. + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_STORAGE"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_STORAGE"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_STORAGE"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareStorageError + - eval_time: 5m + alertname: HardwareStorageError + exp_alerts: + - exp_labels: + name: HARDWARE_STORAGE + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.1 + exp_annotations: + summary: Storage devices error(s) detected + description: "Some storage devices are in error. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_MEMORY"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_MEMORY"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_MEMORY"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareMemoryError + - eval_time: 5m + alertname: HardwareMemoryError + exp_alerts: + - exp_labels: + name: HARDWARE_MEMORY + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.2 + exp_annotations: + summary: DIMM error(s) detected + description: "DIMM error(s) detected. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_PROCESSOR"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_PROCESSOR"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareProcessorError + - eval_time: 5m + alertname: HardwareProcessorError + exp_alerts: + - exp_labels: + name: HARDWARE_PROCESSOR + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.3 + exp_annotations: + summary: Processor error(s) detected + description: "Processor error(s) detected. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_NETWORK"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_NETWORK"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_NETWORK"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareNetworkError + - eval_time: 5m + alertname: HardwareNetworkError + exp_alerts: + - exp_labels: + name: HARDWARE_NETWORK + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.4 + exp_annotations: + summary: Network error(s) detected + description: "Network error(s) detected. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_POWER"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_POWER"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_POWER"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwarePowerError + - eval_time: 5m + alertname: HardwarePowerError + exp_alerts: + - exp_labels: + name: HARDWARE_POWER + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.5 + exp_annotations: + summary: Power supply error(s) detected + description: "Power supply error(s) detected. Check `ceph health detail`." + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="HARDWARE_FANS"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="HARDWARE_FANS"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="HARDWARE_FANS"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: HardwareFanError + - eval_time: 5m + alertname: HardwareFanError + exp_alerts: + - exp_labels: + name: HARDWARE_FANS + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.1.2.1.13.6 + exp_annotations: + summary: Fan error(s) detected + description: "Fan error(s) detected. Check `ceph health detail`." diff --git a/monitoring/snmp/README.md b/monitoring/snmp/README.md index 1a5b609556d..c96dffa3d68 100644 --- a/monitoring/snmp/README.md +++ b/monitoring/snmp/README.md @@ -40,6 +40,7 @@ internet private enterprise ceph ceph Notifications Prometheus Notific .10 (Rados) .11 (cephadm) .12 (prometheus) + .13 (hardware) ``` Individual alerts are placed within the appropriate alert category. For example, to add -- 2.39.5