git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph-mixin: fix manually edited 'prometheus_alerts.yml' file
author: Arun Kumar Mohan <amohan@redhat.com>
Sun, 2 Apr 2023 03:36:04 +0000 (09:06 +0530)
committer: Arun Kumar Mohan <amohan@redhat.com>
Wed, 9 Aug 2023 06:49:04 +0000 (12:19 +0530)
The 'prometheus_alerts.yml' file should not be edited directly.
The changes should be added to the 'prometheus_alerts.libsonnet' file
(and/or any other appropriate lib/jsonnet files) and generated
using the 'make generate' command.

Adding all the changes to 'prometheus_alerts.libsonnet' file and
building/generating the prometheus_alerts YAML file.

PS: all the changes seen in the 'prometheus_alerts.yml' file are due
to the re-arrangement of lines. The file remains the same.

Signed-off-by: Arun Kumar Mohan <amohan@redhat.com>
monitoring/ceph-mixin/prometheus_alerts.libsonnet
monitoring/ceph-mixin/prometheus_alerts.yml

index b7ec0da2f04aab24936bf46c979eb32cd63137fe..6c34321df55628aeacd7ff714d2af0e744ee8f38 100644 (file)
       rules: [
         {
           alert: 'CephPoolGrowthWarning',
-          expr: '(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(%(cluster)spool_id)    group_right ceph_pool_metadata) >= 95' % $.MultiClusterQuery(),
+          expr: '(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(%(cluster)spool_id, instance) group_right() ceph_pool_metadata) >= 95' % $.MultiClusterQuery(),
           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.9.2' },
           annotations: {
             summary: 'Pool growth rate may soon exceed capacity%(cluster)s' % $.MultiClusterSummary(),
index 4a3e6acf389d165e5305c2f9d6d6fc62e46c4344..408382cbeba8bbc4e7daa6ef43c485dfabc4dcb7 100644 (file)
@@ -497,8 +497,8 @@ groups:
           type: "ceph_default"
       - alert: "CephNodeNetworkBondDegraded"
         annotations:
-          summary: "Degraded Bond on Node {{ $labels.instance }}"
           description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}."
+          summary: "Degraded Bond on Node {{ $labels.instance }}"
         expr: |
           node_bonding_slaves - node_bonding_active != 0
         labels:
@@ -573,15 +573,15 @@ groups:
           severity: "warning"
           type: "ceph_default"
       - alert: "CephDaemonSlowOps"
-        for: "30s"
-        expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
-        labels: 
-          severity: 'warning'
-          type: 'ceph_default'
         annotations:
-          summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
           description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
           documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
+          summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
+        expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
+        for: "30s"
+        labels:
+          severity: "warning"
+          type: "ceph_default"
   - name: "cephadm"
     rules:
       - alert: "CephadmUpgradeFailed"