git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
monitoring/grafana,prometheus: add per-pool pg states support
author Aleksei Zakharov <zaharov@selectel.ru>
Tue, 21 Jan 2020 10:44:50 +0000 (13:44 +0300)
committer Aleksei Zakharov <zaharov@selectel.ru>
Wed, 29 Jan 2020 14:28:36 +0000 (17:28 +0300)
Signed-off-by: Aleksei Zakharov <zaharov@selectel.ru>
monitoring/grafana/dashboards/ceph-cluster.json
monitoring/prometheus/alerts/ceph_default_alerts.yml

index 2fcee528d24c0baf33c0616e22a5abed8143d5a6..93fe3372c6c503e68b436fd2d8edae9488c72139 100644 (file)
       },
       "id": 53,
       "legend": {
+        "alignAsTable": true,
         "avg": false,
         "current": false,
         "max": false,
       "steppedLine": false,
       "targets": [
         {
-          "expr": "ceph_pg_total",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Total",
+          "legendFormat": "{{name}} Total",
           "refId": "A"
         },
         {
-          "expr": "ceph_pg_active",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_active)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Active",
+          "legendFormat": "{{name}} Active",
           "refId": "B"
         },
         {
-          "expr": "ceph_pg_total - ceph_pg_active",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Inactive",
+          "legendFormat": "{{name}} Inactive",
           "refId": "G"
         },
         {
-          "expr": "ceph_pg_undersized",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_undersized)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Undersized",
+          "legendFormat": "{{name}} Undersized",
           "refId": "F"
         },
         {
-          "expr": "ceph_pg_degraded",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_degraded)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Degraded",
+          "legendFormat": "{{name}} Degraded",
           "refId": "C"
         },
         {
-          "expr": "ceph_pg_inconsistent",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_inconsistent)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Inconsistent",
+          "legendFormat": "{{name}} Inconsistent",
           "refId": "D"
         },
         {
-          "expr": "ceph_pg_down",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_down)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Down",
+          "legendFormat": "{{name}} Down",
           "refId": "E"
         }
       ],
index 716ccc935df56f6e9b93aeb313548775fb7e4ae6..3f58aeeaeba1fa036a877d42cedbbaceaf176820 100644 (file)
@@ -139,7 +139,7 @@ groups:
   - name: pgs
     rules:
       - alert: pgs inactive
-        expr: ceph_pg_total - ceph_pg_active > 0
+        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
         for: 5m
         labels:
           severity: critical
@@ -147,11 +147,11 @@ groups:
           oid: 1.3.6.1.4.1.50495.15.1.2.7.1
         annotations:
           description: >
-            {{ $value }} PGs have been inactive for more than 5 minutes.
+            {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
             Inactive placement groups aren't able to serve read/write
             requests.
       - alert: pgs unclean
-        expr: ceph_pg_total - ceph_pg_clean > 0
+        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
         for: 15m
         labels:
           severity: warning
@@ -159,7 +159,7 @@ groups:
           oid: 1.3.6.1.4.1.50495.15.1.2.7.2
         annotations:
           description: >
-            {{ $value }} PGs haven't been clean for more than 15 minutes.
+            {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
             Unclean PGs haven't been able to completely recover from a
             previous failure.
   - name: nodes