git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
monitoring: use ceph_cephadm_daemon_status in ceph-nvmeof
author Vallari Agrawal <vallari.agrawal@ibm.com>
Wed, 13 Aug 2025 11:09:04 +0000 (16:39 +0530)
committer Vallari Agrawal <vallari.agrawal@ibm.com>
Wed, 5 Nov 2025 08:29:03 +0000 (13:59 +0530)
Use this metric to show accurate data for "down"
NVMeoF gateways in ceph NVMeoF Overview dashboard.

Fixes: https://tracker.ceph.com/issues/71384
Signed-off-by: Vallari Agrawal <vallari.agrawal@ibm.com>
monitoring/ceph-mixin/dashboards/ceph-nvmeof.libsonnet
monitoring/ceph-mixin/dashboards_out/ceph-nvmeof.json

index 4002142ceccb9fc2ba4009585953a97af313a02c..8350af40c63261220bf2d792b33e3f156c95afff 100644 (file)
@@ -133,14 +133,14 @@ local g = import 'grafonnet/grafana.libsonnet';
       interval='1m',
       color={ mode: 'thresholds' },
       thresholdsMode='',
-      noValue=null,
+      noValue='0',
     ).addThresholds([
       { color: '#808080', value: null },
       { color: 'red', value: 1.0003 },
     ])
     .addTarget(
       $.addTargetSchema(
-        expr="count(ceph_nvmeof_gateway_info) + sum(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})",
+        expr='count(ceph_cephadm_daemon_status{service_type="nvmeof"})',
         format='time_series',
         instant=true,
         legendFormat='Total',
@@ -150,17 +150,17 @@ local g = import 'grafonnet/grafana.libsonnet';
     )
     .addTarget(
       $.addTargetSchema(
-        expr='count(ceph_nvmeof_gateway_info)',
+        expr='count(ceph_cephadm_daemon_status{service_type="nvmeof"}==1 or ceph_cephadm_daemon_status{service_type="nvmeof"}==2)',
         format='time_series',
-        instant=false,
+        instant=true,
         legendFormat='Available',
-        range=true,
+        range=false,
         datasource='$datasource',
       )
     )
     .addTarget(
       $.addTargetSchema(
-        expr="(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})",
+        expr='count(ceph_cephadm_daemon_status{service_type="nvmeof"}==0 or ceph_cephadm_daemon_status{service_type="nvmeof"}==-1 or ceph_cephadm_daemon_status{service_type="nvmeof"} == -2)',
         format='time_series',
         instant=true,
         legendFormat='Down',
@@ -208,8 +208,8 @@ local g = import 'grafonnet/grafana.libsonnet';
     ]),
 
     $.timeSeriesPanel(
-      title='Ceph Health NVMeoF WARNING',
-      description='Ceph healthchecks NVMeoF WARNINGs',
+      title='Unhealthy Gateway Trend',
+      description='Gateways in error states',
       gridPosition={ x: 8, y: 1, w: 7, h: 8 },
       lineInterpolation='linear',
       lineWidth=1,
@@ -221,89 +221,63 @@ local g = import 'grafonnet/grafana.libsonnet';
       showPoints='auto',
       unit='none',
       displayMode='list',
-      showLegend=true,
+      showLegend=false,
       placement='bottom',
-      tooltip={ mode: 'multi', sort: 'desc' },
+      tooltip={ hideZeros: true, mode: 'multi', sort: 'desc' },
       stackingMode='normal',
       spanNulls=false,
       decimals=0,
       thresholdsMode='absolute',
-      noValue='0',
+      noValue=0,
     ).addThresholds([
       { color: 'green', value: null },
     ])
     .addTarget(
       $.addTargetSchema(
-        expr="sum(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})",
+        expr='group by (daemon_name) (ceph_cephadm_daemon_status{service_type="nvmeof"} == 0)',
         format='',
         instant=false,
-        legendFormat='NVMEOF_GATEWAY_DOWN',
+        legendFormat='stopped - {{ daemon_name }} ',
         range=true,
         datasource='$datasource',
       )
     )
     .addTarget(
       $.addTargetSchema(
-        expr="sum(ceph_health_detail{name='NVMEOF_GATEWAY_DELETING'})",
+        expr='group by (daemon_name) (ceph_cephadm_daemon_status{service_type="nvmeof"} == -1)',
         format='',
         instant=false,
-        legendFormat='NVMEOF_GATEWAY_DELETING',
+        legendFormat='error - {{ daemon_name }}',
         range=true,
         datasource='$datasource',
       )
     )
     .addTarget(
       $.addTargetSchema(
-        expr="sum(ceph_health_detail{name='NVMEOF_SINGLE_GATEWAY'})",
+        expr='group by (daemon_name) (ceph_cephadm_daemon_status{service_type="nvmeof"} == -2)',
         format='',
         instant=false,
-        legendFormat='NVMEOF_SINGLE_GATEWAY',
+        legendFormat='unknown_state - {{ daemon_name }}',
         range=true,
         datasource='$datasource',
       )
     )
-    .addOverrides([
-      {
-        matcher: { id: 'byName', options: 'NVMEOF_GATEWAY_DOWN' },
-        properties: [
-          {
-            id: 'color',
-            value: {
-              fixedColor: 'red',
-              mode: 'fixed',
-            },
-          },
-        ],
-      },
-      {
-        matcher: { id: 'byName', options: 'NVMEOF_GATEWAY_DELETING' },
-        properties: [
-          {
-            id: 'color',
-            value: {
-              fixedColor: 'dark-purple',
-              mode: 'fixed',
-            },
-          },
-        ],
-      },
-      {
-        matcher: { id: 'byName', options: 'NVMEOF_SINGLE_GATEWAY' },
-        properties: [
-          {
-            id: 'custom.lineWidth',
-            value: 1,
-          },
-          {
-            id: 'color',
-            value: {
-              fixedColor: 'super-light-orange',
-              mode: 'shades',
+    .addOverrides(
+      [
+        {
+          matcher: { id: 'byType', options: 'number' },
+          properties: [
+            {
+              id: 'color',
+              value: {
+                fixedColor: 'orange',
+                mode: 'shades',
+              },
             },
-          },
-        ],
-      },
-    ]),
+          ],
+        },
+      ]
+    ),
 
     $.addAlertListPanel(
       title='Active Alerts',
index 2112d7a75404da07301c2ae295a55c8dad398129..fc5859d212cb3217e4057b04db52b74f2d7fc141 100644 (file)
                "decimals": 0,
                "links": [ ],
                "mappings": [ ],
+               "noValue": "0",
                "thresholds": {
                   "mode": "",
                   "steps": [
          "targets": [
             {
                "datasource": "$datasource",
-               "expr": "count(ceph_nvmeof_gateway_info) + sum(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})",
+               "expr": "count(ceph_cephadm_daemon_status{service_type=\"nvmeof\"})",
                "format": "time_series",
                "instant": true,
                "intervalFactor": 1,
             },
             {
                "datasource": "$datasource",
-               "expr": "count(ceph_nvmeof_gateway_info)",
+               "expr": "count(ceph_cephadm_daemon_status{service_type=\"nvmeof\"}==1 or ceph_cephadm_daemon_status{service_type=\"nvmeof\"}==2)",
                "format": "time_series",
-               "instant": false,
+               "instant": true,
                "intervalFactor": 1,
                "legendFormat": "Available",
-               "range": true,
+               "range": false,
                "refId": "B"
             },
             {
                "datasource": "$datasource",
-               "expr": "(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})",
+               "expr": "count(ceph_cephadm_daemon_status{service_type=\"nvmeof\"}==0 or ceph_cephadm_daemon_status{service_type=\"nvmeof\"}==-1 or ceph_cephadm_daemon_status{service_type=\"nvmeof\"} == -2)",
                "format": "time_series",
                "instant": true,
                "intervalFactor": 1,
       },
       {
          "datasource": "$datasource",
-         "description": "Ceph healthchecks NVMeoF WARNINGs",
+         "description": "Gateways in error states",
          "fieldConfig": {
             "defaults": {
                "color": {
                   }
                },
                "decimals": 0,
-               "noValue": "0",
+               "mappings": [ ],
+               "noValue": 0,
                "thresholds": {
                   "mode": "absolute",
                   "steps": [
             "overrides": [
                {
                   "matcher": {
-                     "id": "byName",
-                     "options": "NVMEOF_GATEWAY_DOWN"
-                  },
-                  "properties": [
-                     {
-                        "id": "color",
-                        "value": {
-                           "fixedColor": "red",
-                           "mode": "fixed"
-                        }
-                     }
-                  ]
-               },
-               {
-                  "matcher": {
-                     "id": "byName",
-                     "options": "NVMEOF_GATEWAY_DELETING"
-                  },
-                  "properties": [
-                     {
-                        "id": "color",
-                        "value": {
-                           "fixedColor": "dark-purple",
-                           "mode": "fixed"
-                        }
-                     }
-                  ]
-               },
-               {
-                  "matcher": {
-                     "id": "byName",
-                     "options": "NVMEOF_SINGLE_GATEWAY"
+                     "id": "byType",
+                     "options": "number"
                   },
                   "properties": [
-                     {
-                        "id": "custom.lineWidth",
-                        "value": 1
-                     },
                      {
                         "id": "color",
                         "value": {
-                           "fixedColor": "super-light-orange",
+                           "fixedColor": "orange",
                            "mode": "shades"
                         }
                      }
                "calcs": [ ],
                "displayMode": "list",
                "placement": "bottom",
-               "showLegend": true
+               "showLegend": false
             },
             "tooltip": {
+               "hideZeros": true,
                "mode": "multi",
                "sort": "desc"
             }
          "targets": [
             {
                "datasource": "$datasource",
-               "expr": "sum(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})",
+               "expr": "group by (daemon_name) (ceph_cephadm_daemon_status{service_type=\"nvmeof\"} == 0)",
                "format": "",
                "instant": false,
                "intervalFactor": 1,
-               "legendFormat": "NVMEOF_GATEWAY_DOWN",
+               "legendFormat": "stopped - {{ daemon_name }} ",
                "range": true,
                "refId": "A"
             },
             {
                "datasource": "$datasource",
-               "expr": "sum(ceph_health_detail{name='NVMEOF_GATEWAY_DELETING'})",
+               "expr": "group by (daemon_name) (ceph_cephadm_daemon_status{service_type=\"nvmeof\"} == -1)",
                "format": "",
                "instant": false,
                "intervalFactor": 1,
-               "legendFormat": "NVMEOF_GATEWAY_DELETING",
+               "legendFormat": "error - {{ daemon_name }}",
                "range": true,
                "refId": "B"
             },
             {
                "datasource": "$datasource",
-               "expr": "sum(ceph_health_detail{name='NVMEOF_SINGLE_GATEWAY'})",
+               "expr": "group by (daemon_name) (ceph_cephadm_daemon_status{service_type=\"nvmeof\"} == -2)",
                "format": "",
                "instant": false,
                "intervalFactor": 1,
-               "legendFormat": "NVMEOF_SINGLE_GATEWAY",
+               "legendFormat": "unknown_state - {{ daemon_name }}",
                "range": true,
                "refId": "C"
             }
          ],
-         "title": "Ceph Health NVMeoF WARNING",
+         "title": "Unhealthy Gateway Trend",
          "type": "timeseries"
       },
       {
                   }
                },
                "decimals": 0,
+               "mappings": [ ],
                "noValue": "0",
                "thresholds": {
                   "mode": "absolute",