From: Arun Kumar Mohan Date: Sun, 2 Apr 2023 03:36:04 +0000 (+0530) Subject: ceph-mixin: fix manually edited 'prometheus_alerts.yml' file X-Git-Tag: v18.2.5~707^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=61d173316b3904a20faf99e774c24edd448258bf;p=ceph.git ceph-mixin: fix manually edited 'prometheus_alerts.yml' file The 'prometheus_alerts.yml' file should not be edited directly. The changes should be added to the 'prometheus_alerts.libsonnet' file (and/or any other appropriate lib/jsonnet files) and generated using the 'make generate' command. Adding all the changes to the 'prometheus_alerts.libsonnet' file and building/generating the prometheus_alerts YAML file. PS: all the changes seen in the 'prometheus_alerts.yml' file are due to the re-arrangement of lines. The file remains the same. Signed-off-by: Arun Kumar Mohan (cherry picked from commit e9d803d6080ee22f5912a6b748701aadde181c28) --- diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index a6ab4c2a3f90..b7558a70fa87 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -587,7 +587,7 @@ rules: [ { alert: 'CephPoolGrowthWarning', - expr: '(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(%(cluster)spool_id) group_right ceph_pool_metadata) >= 95' % $.MultiClusterQuery(), + expr: '(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(%(cluster)spool_id, instance) group_right() ceph_pool_metadata) >= 95' % $.MultiClusterQuery(), labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.9.2' }, annotations: { summary: 'Pool growth rate may soon exceed capacity%(cluster)s' % $.MultiClusterSummary(), diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index e491c753f3c7..49c38ebd3555 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ 
b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -497,8 +497,8 @@ groups: type: "ceph_default" - alert: "CephNodeNetworkBondDegraded" annotations: - summary: "Degraded Bond on Node {{ $labels.instance }}" description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}." + summary: "Degraded Bond on Node {{ $labels.instance }}" expr: | node_bonding_slaves - node_bonding_active != 0 labels: @@ -573,15 +573,15 @@ groups: severity: "warning" type: "ceph_default" - alert: "CephDaemonSlowOps" - for: "30s" - expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0" - labels: - severity: 'warning' - type: 'ceph_default' annotations: - summary: "{{ $labels.ceph_daemon }} operations are slow to complete" description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)" documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops" + summary: "{{ $labels.ceph_daemon }} operations are slow to complete" + expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" - name: "cephadm" rules: - alert: "CephadmUpgradeFailed"