]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph-mixin: fix manually edited 'prometheus_alerts.yml' file
author Arun Kumar Mohan <amohan@redhat.com>
Sun, 2 Apr 2023 03:36:04 +0000 (09:06 +0530)
committer Arun Kumar Mohan <amohan@redhat.com>
Thu, 28 Mar 2024 08:21:57 +0000 (13:51 +0530)
The 'prometheus_alerts.yml' file should not be edited directly.
Changes should be added to the 'prometheus_alerts.libsonnet' file
(and/or any other appropriate lib/jsonnet files) and generated
using the 'make generate' command.

Adding all the changes to 'prometheus_alerts.libsonnet' file and
building/generating the prometheus_alerts YAML file.

PS: all the changes seen in the 'prometheus_alerts.yml' file are due
to the re-arrangement of lines. The file remains the same.

Signed-off-by: Arun Kumar Mohan <amohan@redhat.com>
(cherry picked from commit e9d803d6080ee22f5912a6b748701aadde181c28)

monitoring/ceph-mixin/prometheus_alerts.libsonnet
monitoring/ceph-mixin/prometheus_alerts.yml

index a6ab4c2a3f909584e00df836eb4d62738e7c483e..b7558a70fa87e11f51ef5b556438861cb8e67768 100644 (file)
       rules: [
         {
           alert: 'CephPoolGrowthWarning',
-          expr: '(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(%(cluster)spool_id)    group_right ceph_pool_metadata) >= 95' % $.MultiClusterQuery(),
+          expr: '(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(%(cluster)spool_id, instance) group_right() ceph_pool_metadata) >= 95' % $.MultiClusterQuery(),
           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.9.2' },
           annotations: {
             summary: 'Pool growth rate may soon exceed capacity%(cluster)s' % $.MultiClusterSummary(),
index e491c753f3c7db0529880e632e58170a9fe9bbe6..49c38ebd3555f9d2f2fb40507e6d7a59faf2fff6 100644 (file)
@@ -497,8 +497,8 @@ groups:
           type: "ceph_default"
       - alert: "CephNodeNetworkBondDegraded"
         annotations:
-          summary: "Degraded Bond on Node {{ $labels.instance }}"
           description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}."
+          summary: "Degraded Bond on Node {{ $labels.instance }}"
         expr: |
           node_bonding_slaves - node_bonding_active != 0
         labels:
@@ -573,15 +573,15 @@ groups:
           severity: "warning"
           type: "ceph_default"
       - alert: "CephDaemonSlowOps"
-        for: "30s"
-        expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
-        labels: 
-          severity: 'warning'
-          type: 'ceph_default'
         annotations:
-          summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
           description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
           documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
+          summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
+        expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
+        for: "30s"
+        labels:
+          severity: "warning"
+          type: "ceph_default"
   - name: "cephadm"
     rules:
       - alert: "CephadmUpgradeFailed"