git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/dashboard: Fix CephPoolGrowthWarning alert 49477/head
author: Aashish Sharma <aasharma@redhat.com>
Tue, 18 Oct 2022 11:12:32 +0000 (16:42 +0530)
committer: Aashish Sharma <aasharma@redhat.com>
Fri, 16 Dec 2022 11:12:33 +0000 (16:42 +0530)
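Prometheus reports the error "many-to-many matching not allowed: matching labels must be unique on one side" for the CephPoolGrowthWarning alert when the same pool IDs exist on two different instances. The alert expression now matches on both pool_id and instance so that each side of the join is unique.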

Fixes: https://tracker.ceph.com/issues/58017
Signed-off-by: Aashish Sharma <aasharma@redhat.com>
(cherry picked from commit 97189b66afd4623ae09bc4ba12e1af6f69821793)

monitoring/ceph-mixin/prometheus_alerts.yml
monitoring/ceph-mixin/tests_alerts/test_alerts.yml

index a544d41eb0ee0ac15806a52c93ea5c36856c20cf..bd773f27c54023105dfd2e85e686513554554e73 100644 (file)
@@ -518,7 +518,7 @@ groups:
         annotations:
           description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
           summary: "Pool growth rate may soon exceed capacity"
-        expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)    group_right ceph_pool_metadata) >= 95"
+        expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95"
         labels:
           oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
           severity: "warning"
index 7b7e7db7301bddc9f0d55e2ca2b56198fb89652c..8cdb563493603c467caacd0addad0eaac645382b 100644 (file)
@@ -1472,35 +1472,44 @@ tests:
    # trigger percent full prediction on pools 1 and 2 only
  - interval: 12h
    input_series:
-    - series: 'ceph_pool_percent_used{pool_id="1"}'
-      values: '70 75 80 87 92'
-    - series: 'ceph_pool_percent_used{pool_id="2"}'
+    - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
+      values: '78 89 79 98 78'
+    - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
       values: '22 22 23 23 24'
-    - series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
+    - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}'
       values: '1 1 1 1 1'
-    - series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
+    - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
       values: '1 1 1 1 1'
    promql_expr_test:
      - expr: |
-         (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
-              group_right ceph_pool_metadata) >= 95
+         (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
+              group_right() ceph_pool_metadata) >= 95
        eval_time: 36h
        exp_samples:
-         - labels: '{name="rbd",pool_id="1",type="replicated"}'
-           value: 1.424E+02 # 142%
+         - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
+           value: 1.435E+02 # 143.5%
    alert_rule_test:
     - eval_time: 48h
       alertname: CephPoolGrowthWarning
       exp_alerts:
       - exp_labels:
-          name: rbd
+          instance: 8090
+          name: default.rgw.index
           pool_id: 1
           severity: warning
           type: ceph_default
           oid: 1.3.6.1.4.1.50495.1.2.1.9.2
         exp_annotations:
           summary: Pool growth rate may soon exceed capacity
-          description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
+          description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
  - interval: 1m
    input_series:
     - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'