From 1fa6af9ed243f326a22a24478803f71daf27cc26 Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Wed, 13 Aug 2025 16:39:04 +0530 Subject: [PATCH] monitoring: use ceph_cephadm_daemon_status in ceph-nvmeof Use this metric to show accurate data for "down" NVMeoF gateways in ceph NVMeoF Overview dashboard. Gateways whose status is 1 or 2 are counted as "Available"; status 0 (stopped), -1 (error) and -2 (unknown_state) are counted as "Down". Fixes: https://tracker.ceph.com/issues/71384 Signed-off-by: Vallari Agrawal --- .../dashboards/ceph-nvmeof.libsonnet | 317 ++++++++---------- .../dashboards_out/ceph-nvmeof.json | 72 ++-- 2 files changed, 166 insertions(+), 223 deletions(-) diff --git a/monitoring/ceph-mixin/dashboards/ceph-nvmeof.libsonnet b/monitoring/ceph-mixin/dashboards/ceph-nvmeof.libsonnet index 4002142cecc..44d271e1646 100644 --- a/monitoring/ceph-mixin/dashboards/ceph-nvmeof.libsonnet +++ b/monitoring/ceph-mixin/dashboards/ceph-nvmeof.libsonnet @@ -119,191 +119,164 @@ local g = import 'grafonnet/grafana.libsonnet'; ) ), - $.addStatPanel( - title='Total Gateways', - description='', - unit='', - datasource='$datasource', - gridPosition={ x: 3, y: 1, w: 5, h: 3 }, - colorMode='background', - graphMode='none', - justifyMode='auto', - orientation='auto', - textMode='auto', - interval='1m', - color={ mode: 'thresholds' }, - thresholdsMode='', - noValue=null, - ).addThresholds([ - { color: '#808080', value: null }, - { color: 'red', value: 1.0003 }, - ]) - .addTarget( - $.addTargetSchema( - expr="count(ceph_nvmeof_gateway_info) + sum(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})", - format='time_series', - instant=true, - legendFormat='Total', - range=false, + $.addStatPanel( + title='Total Gateways', + description='', + unit='', datasource='$datasource', + gridPosition={ x: 3, y: 1, w: 5, h: 3 }, + colorMode='background', + graphMode='none', + justifyMode='auto', + orientation='auto', + textMode='auto', + interval='1m', + color={ mode: 'thresholds' }, + thresholdsMode='', + noValue='0', + ).addThresholds([ + { color: '#808080', value: null }, + { color: 'red', value: 1.0003 }, + ]) +
.addTarget( + $.addTargetSchema( + expr='count(ceph_cephadm_daemon_status{service_type="nvmeof"})', + format='time_series', + instant=true, + legendFormat='Total', + range=false, + datasource='$datasource', + ) ) - ) - .addTarget( - $.addTargetSchema( - expr='count(ceph_nvmeof_gateway_info)', - format='time_series', - instant=false, - legendFormat='Available', - range=true, - datasource='$datasource', + .addTarget( + $.addTargetSchema( + expr='count(ceph_cephadm_daemon_status{service_type="nvmeof"}==1 or ceph_cephadm_daemon_status{service_type="nvmeof"}==2)', + format='time_series', + instant=true, + legendFormat='Available', + range=false, + datasource='$datasource', + ) ) - ) - .addTarget( - $.addTargetSchema( - expr="(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})", - format='time_series', - instant=true, - legendFormat='Down', - range=false, - datasource='$datasource', + .addTarget( + $.addTargetSchema( + expr='count(ceph_cephadm_daemon_status{service_type="nvmeof"}==0 or ceph_cephadm_daemon_status{service_type="nvmeof"}==-1 or ceph_cephadm_daemon_status{service_type="nvmeof"} == -2)', + format='time_series', + instant=true, + legendFormat='Down', + range=false, + datasource='$datasource', + ) ) - ) - .addOverrides([ - { - matcher: { id: 'byName', options: 'Down' }, - properties: [ - { - id: 'color', - value: { - fixedColor: 'red', - mode: 'fixed', + .addOverrides([ + { + matcher: { id: 'byName', options: 'Down' }, + properties: [ + { + id: 'color', + value: { + fixedColor: 'red', + mode: 'fixed', + }, }, - }, - ], - }, - { - matcher: { id: 'byName', options: 'Total' }, - properties: [ - { - id: 'color', - value: { - fixedColor: '#a7a38b', - mode: 'fixed', + ], + }, + { + matcher: { id: 'byName', options: 'Total' }, + properties: [ + { + id: 'color', + value: { + fixedColor: '#a7a38b', + mode: 'fixed', + }, }, - }, - ], - }, - { - matcher: { id: 'byName', options: 'Available' }, - properties: [ - { - id: 'color', - value: { - fixedColor: 'green', - mode:
'fixed', + ], + }, + { + matcher: { id: 'byName', options: 'Available' }, + properties: [ + { + id: 'color', + value: { + fixedColor: 'green', + mode: 'fixed', + }, }, - }, - ], - }, - ]), + ], + }, + ]), - $.timeSeriesPanel( - title='Ceph Health NVMeoF WARNING', - description='Ceph healthchecks NVMeoF WARNINGs', - gridPosition={ x: 8, y: 1, w: 7, h: 8 }, - lineInterpolation='linear', - lineWidth=1, - drawStyle='line', - axisPlacement='auto', - datasource='$datasource', - fillOpacity=5, - pointSize=5, - showPoints='auto', - unit='none', - displayMode='list', - showLegend=true, - placement='bottom', - tooltip={ mode: 'multi', sort: 'desc' }, - stackingMode='normal', - spanNulls=false, - decimals=0, - thresholdsMode='absolute', - noValue='0', - ).addThresholds([ - { color: 'green', value: null }, - ]) - .addTarget( - $.addTargetSchema( - expr="sum(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})", - format='', - instant=false, - legendFormat='NVMEOF_GATEWAY_DOWN', - range=true, + $.timeSeriesPanel( + title='Gateway WARNING Health States', + description='Gateways in error states', + gridPosition={ x: 8, y: 1, w: 7, h: 8 }, + lineInterpolation='linear', + lineWidth=1, + drawStyle='line', + axisPlacement='auto', datasource='$datasource', + fillOpacity=5, + pointSize=5, + showPoints='auto', + unit='none', + displayMode='list', + showLegend=false, + placement='bottom', + tooltip={ hideZeros: true, mode: 'multi', sort: 'desc' }, + stackingMode='normal', + spanNulls=false, + decimals=0, + thresholdsMode='absolute', + noValue='0', + ).addThresholds([ + { color: 'green', value: null }, + ]) + .addTarget( + $.addTargetSchema( + expr='group by (daemon_name) (ceph_cephadm_daemon_status{service_type="nvmeof"} == 0)', + format='', + instant=false, + legendFormat='stopped - {{ daemon_name }}', + range=true, + datasource='$datasource', + ) ) - ) - .addTarget( - $.addTargetSchema( - expr="sum(ceph_health_detail{name='NVMEOF_GATEWAY_DELETING'})", - format='', - instant=false, -
legendFormat='NVMEOF_GATEWAY_DELETING', - range=true, - datasource='$datasource', + .addTarget( + $.addTargetSchema( + expr='group by (daemon_name) (ceph_cephadm_daemon_status{service_type="nvmeof"} == -1)', + format='', + instant=false, + legendFormat='error - {{ daemon_name }}', + range=true, + datasource='$datasource', + ) ) - ) - .addTarget( - $.addTargetSchema( - expr="sum(ceph_health_detail{name='NVMEOF_SINGLE_GATEWAY'})", - format='', - instant=false, - legendFormat='NVMEOF_SINGLE_GATEWAY', - range=true, - datasource='$datasource', + .addTarget( + $.addTargetSchema( + expr='group by (daemon_name) (ceph_cephadm_daemon_status{service_type="nvmeof"} == -2)', + format='', + instant=false, + legendFormat='unknown_state - {{ daemon_name }}', + range=true, + datasource='$datasource', + ) ) - ) - .addOverrides([ - { - matcher: { id: 'byName', options: 'NVMEOF_GATEWAY_DOWN' }, - properties: [ - { - id: 'color', - value: { - fixedColor: 'red', - mode: 'fixed', - }, - }, - ], - }, - { - matcher: { id: 'byName', options: 'NVMEOF_GATEWAY_DELETING' }, - properties: [ - { - id: 'color', - value: { - fixedColor: 'dark-purple', - mode: 'fixed', + .addOverrides([ + { + matcher: { id: 'byType', options: 'number' }, + properties: [ + { + id: 'color', + value: { + fixedColor: 'orange', + mode: 'shades', + }, }, - }, - ], - }, - { - matcher: { id: 'byName', options: 'NVMEOF_SINGLE_GATEWAY' }, - properties: [ - { - id: 'custom.lineWidth', - value: 1, - }, - { - id: 'color', - value: { - fixedColor: 'super-light-orange', - mode: 'shades', - }, - }, - ], - }, - ]), + ], + }, + ] + ), $.addAlertListPanel( title='Active Alerts', diff --git a/monitoring/ceph-mixin/dashboards_out/ceph-nvmeof.json b/monitoring/ceph-mixin/dashboards_out/ceph-nvmeof.json index 2112d7a7540..739668beb89 100644 --- a/monitoring/ceph-mixin/dashboards_out/ceph-nvmeof.json +++ b/monitoring/ceph-mixin/dashboards_out/ceph-nvmeof.json @@ -138,6 +138,7 @@ "decimals": 0, "links": [ ], "mappings": [ ], + "noValue": "0",
"thresholds": { "mode": "", "steps": [ @@ -228,7 +229,7 @@ "targets": [ { "datasource": "$datasource", - "expr": "count(ceph_nvmeof_gateway_info) + sum(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})", + "expr": "count(ceph_cephadm_daemon_status{service_type=\"nvmeof\"})", "format": "time_series", "instant": true, "intervalFactor": 1, @@ -238,17 +239,17 @@ }, { "datasource": "$datasource", - "expr": "count(ceph_nvmeof_gateway_info)", + "expr": "count(ceph_cephadm_daemon_status{service_type=\"nvmeof\"}==1 or ceph_cephadm_daemon_status{service_type=\"nvmeof\"}==2)", "format": "time_series", - "instant": false, + "instant": true, "intervalFactor": 1, "legendFormat": "Available", - "range": true, + "range": false, "refId": "B" }, { "datasource": "$datasource", - "expr": "(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})", + "expr": "count(ceph_cephadm_daemon_status{service_type=\"nvmeof\"}==0 or ceph_cephadm_daemon_status{service_type=\"nvmeof\"}==-1 or ceph_cephadm_daemon_status{service_type=\"nvmeof\"} == -2)", "format": "time_series", "instant": true, "intervalFactor": 1, @@ -263,7 +264,7 @@ }, { "datasource": "$datasource", - "description": "Ceph healthchecks NVMeoF WARNINGs", + "description": "Gateways in error states", "fieldConfig": { "defaults": { "color": { @@ -300,7 +301,8 @@ } }, "decimals": 0, + "mappings": [ ], "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ @@ -315,48 +317,14 @@ "overrides": [ { "matcher": { - "id": "byName", - "options": "NVMEOF_GATEWAY_DOWN" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "NVMEOF_GATEWAY_DELETING" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "dark-purple", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "NVMEOF_SINGLE_GATEWAY" + "id": "byType", + "options": "number" }, "properties": [ - { - "id":
"custom.lineWidth", - "value": 1 - }, { "id": "color", "value": { - "fixedColor": "super-light-orange", + "fixedColor": "orange", "mode": "shades" } } @@ -376,9 +344,10 @@ "calcs": [ ], "displayMode": "list", "placement": "bottom", - "showLegend": true + "showLegend": false }, "tooltip": { + "hideZeros": true, "mode": "multi", "sort": "desc" } @@ -388,36 +357,36 @@ "targets": [ { "datasource": "$datasource", - "expr": "sum(ceph_health_detail{name='NVMEOF_GATEWAY_DOWN'})", + "expr": "group by (daemon_name) (ceph_cephadm_daemon_status{service_type=\"nvmeof\"} == 0)", "format": "", "instant": false, "intervalFactor": 1, - "legendFormat": "NVMEOF_GATEWAY_DOWN", + "legendFormat": "stopped - {{ daemon_name }}", "range": true, "refId": "A" }, { "datasource": "$datasource", - "expr": "sum(ceph_health_detail{name='NVMEOF_GATEWAY_DELETING'})", + "expr": "group by (daemon_name) (ceph_cephadm_daemon_status{service_type=\"nvmeof\"} == -1)", "format": "", "instant": false, "intervalFactor": 1, - "legendFormat": "NVMEOF_GATEWAY_DELETING", + "legendFormat": "error - {{ daemon_name }}", "range": true, "refId": "B" }, { "datasource": "$datasource", - "expr": "sum(ceph_health_detail{name='NVMEOF_SINGLE_GATEWAY'})", + "expr": "group by (daemon_name) (ceph_cephadm_daemon_status{service_type=\"nvmeof\"} == -2)", "format": "", "instant": false, "intervalFactor": 1, - "legendFormat": "NVMEOF_SINGLE_GATEWAY", + "legendFormat": "unknown_state - {{ daemon_name }}", "range": true, "refId": "C" } ], - "title": "Ceph Health NVMeoF WARNING", + "title": "Gateway WARNING Health States", "type": "timeseries" }, { @@ -1048,6 +1017,7 @@ } }, "decimals": 0, + "mappings": [ ], "noValue": "0", "thresholds": { "mode": "absolute", -- 2.39.5