From d22f1d281d86a5d521208e84763bd51036372a78 Mon Sep 17 00:00:00 2001 From: Patrick Seidensal Date: Tue, 15 Jun 2021 14:43:50 +0200 Subject: [PATCH] monitoring: remove instance label from ceph-cluster.json completely The `instance` label is only useful if - the exporter returns only data about its node or instance - the exporter provides an instance label and then may return data about other nodes In this case, it's about the Prometheus mgr module, which is a single exporter providing data about a whole cluster, so not only data related to the node (or instance) the mgr module is running on. It is completely irrelevant which node the exporter runs on; the data provided doesn't change. The exporter also doesn't provide `instance` labels (which Prometheus wouldn't change due to our configuration, see "honor_labels" setting). (Actually there's one exception where `instance` labels are provided by the Ceph mgr module, but that doesn't affect the Ceph Cluster dashboard.) Note that keeping that instance label on this particular dashboard would enable the user to switch between the data collected from a previously failed mgr instance and the data collected from the currently running mgr instance (on which the Prometheus mgr module runs). That'd split the data, which I don't think is a useful feature, but rather looks broken. 
Fixes: https://tracker.ceph.com/issues/51212 Signed-off-by: Patrick Seidensal (cherry picked from commit 037410713f032c0a2a25243e411ae67dffcc1d1a) --- .../grafana/dashboards/ceph-cluster.json | 63 ++++++------------- 1 file changed, 20 insertions(+), 43 deletions(-) diff --git a/monitoring/grafana/dashboards/ceph-cluster.json b/monitoring/grafana/dashboards/ceph-cluster.json index 5603b064af68..61a425d09f2e 100644 --- a/monitoring/grafana/dashboards/ceph-cluster.json +++ b/monitoring/grafana/dashboards/ceph-cluster.json @@ -107,7 +107,7 @@ "tableColumn": "", "targets": [ { - "expr": "ceph_health_status{instance=~'$instance'}", + "expr": "ceph_health_status", "format": "time_series", "instant": true, "interval": "$interval", @@ -175,7 +175,7 @@ "displayAliasType": "Always", "displayType": "Regular", "displayValueWithAlias": "When Alias Displayed", - "expr": "count(ceph_osd_metadata{instance=~\"$instance\"})", + "expr": "count(ceph_osd_metadata)", "format": "time_series", "intervalFactor": 1, "legendFormat": "All", @@ -190,7 +190,7 @@ "displayAliasType": "Always", "displayType": "Regular", "displayValueWithAlias": "When Alias Displayed", - "expr": "sum(ceph_osds_in{instance=~\"$instance\"})", + "expr": "sum(ceph_osds_in)", "format": "time_series", "intervalFactor": 1, "legendFormat": "In", @@ -205,7 +205,7 @@ "displayAliasType": "Warning / Critical", "displayType": "Regular", "displayValueWithAlias": "When Alias Displayed", - "expr": "sum(ceph_osd_in{instance=~\"$instance\"} == bool 0)", + "expr": "sum(ceph_osd_in == bool 0)", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -222,7 +222,7 @@ "displayAliasType": "Always", "displayType": "Regular", "displayValueWithAlias": "When Alias Displayed", - "expr": "sum(ceph_osd_up{instance=~\"$instance\"})", + "expr": "sum(ceph_osd_up)", "format": "time_series", "intervalFactor": 1, "legendFormat": "Up", @@ -238,7 +238,7 @@ "displayAliasType": "Warning / Critical", "displayType": "Regular", 
"displayValueWithAlias": "When Alias Displayed", - "expr": "sum(ceph_osd_up{instance=~\"$instance\"} == bool 0)", + "expr": "sum(ceph_osd_up == bool 0)", "format": "time_series", "intervalFactor": 1, "legendFormat": "Down", @@ -313,7 +313,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(ceph_osd_stat_bytes_used{instance=~\"$instance\"})/sum(ceph_osd_stat_bytes{instance=~\"$instance\"})", + "expr": "sum(ceph_osd_stat_bytes_used)/sum(ceph_osd_stat_bytes)", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used", @@ -531,28 +531,28 @@ "steppedLine": false, "targets": [ { - "expr": "quantile(0.95, ceph_osd_apply_latency_ms{instance=~\"$instance\"})", + "expr": "quantile(0.95, ceph_osd_apply_latency_ms)", "format": "time_series", "intervalFactor": 1, "legendFormat": "Apply Latency P_95", "refId": "A" }, { - "expr": "quantile(0.95, ceph_osd_commit_latency_ms{instance=~\"$instance\"})", + "expr": "quantile(0.95, ceph_osd_commit_latency_ms)", "format": "time_series", "intervalFactor": 1, "legendFormat": "Commit Latency P_95", "refId": "B" }, { - "expr": "avg(ceph_osd_apply_latency_ms{instance=~\"$instance\"})", + "expr": "avg(ceph_osd_apply_latency_ms)", "format": "time_series", "intervalFactor": 1, "legendFormat": "Avg Apply Latency", "refId": "C" }, { - "expr": "avg(ceph_osd_commit_latency_ms{instance=~\"$instance\"})", + "expr": "avg(ceph_osd_commit_latency_ms)", "format": "time_series", "intervalFactor": 1, "legendFormat": "Avg Commit Latency", @@ -630,7 +630,7 @@ "displayAliasType": "Always", "displayType": "Regular", "displayValueWithAlias": "When Alias Displayed", - "expr": "sum(ceph_mon_quorum_status{instance=~\"$instance\"})", + "expr": "sum(ceph_mon_quorum_status)", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -647,7 +647,7 @@ "displayAliasType": "Always", "displayType": "Regular", "displayValueWithAlias": "When Alias Displayed", - "expr": "count(ceph_mon_quorum_status{instance=~\"$instance\"})", + "expr": 
"count(ceph_mon_quorum_status)", "format": "time_series", "intervalFactor": 1, "legendFormat": "Total", @@ -664,7 +664,7 @@ "displayAliasType": "Warning / Critical", "displayType": "Annotation", "displayValueWithAlias": "Never", - "expr": "count(ceph_mon_quorum_status{instance=~\"$instance\"}) / sum(ceph_mon_quorum_status{instance=~\"$instance\"})", + "expr": "count(ceph_mon_quorum_status) / sum(ceph_mon_quorum_status)", "format": "time_series", "intervalFactor": 1, "legendFormat": "MONs out of Quorum", @@ -711,7 +711,7 @@ "displayAliasType": "Always", "displayType": "Regular", "displayValueWithAlias": "When Alias Displayed", - "expr": "ceph_mds_server_handle_client_session{instance=~\"$instance\"}", + "expr": "ceph_mds_server_handle_client_session", "format": "time_series", "intervalFactor": 1, "legendFormat": "Clients", @@ -765,14 +765,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum(irate(ceph_osd_op_w_in_bytes{instance=~\"$instance\"}[1m]))", + "expr": "sum(irate(ceph_osd_op_w_in_bytes[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Writes", "refId": "A" }, { - "expr": "sum(irate(ceph_osd_op_r_out_bytes{instance=~\"$instance\"}[1m]))", + "expr": "sum(irate(ceph_osd_op_r_out_bytes[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Reads", @@ -852,7 +852,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(deriv(ceph_pool_stored{instance=~\"$instance\"}[1m]))", + "expr": "sum(deriv(ceph_pool_stored[1m]))", "format": "time_series", "intervalFactor": 1, "refId": "A" @@ -925,7 +925,7 @@ "span": 12, "targets": [ { - "expr": "ceph_osd_stat_bytes_used{instance=~'$instance'} / ceph_osd_stat_bytes{instance=~'$instance'}", + "expr": "ceph_osd_stat_bytes_used / ceph_osd_stat_bytes", "format": "time_series", "interval": "1m", "intervalFactor": 1, @@ -987,7 +987,7 @@ "links": [], "targets": [ { - "expr": "ceph_osd_numpg{instance=~\"$instance\"}", + "expr": "ceph_osd_numpg", "format": "time_series", "intervalFactor": 1, 
"legendFormat": "#PGs", @@ -1191,29 +1191,6 @@ "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", "refresh": 2, "type": "interval" - }, - { - "allFormat": "glob", - "allValue": null, - "current": {}, - "datasource": "$datasource", - "hide": 0, - "hideLabel": false, - "includeAll": true, - "label": "Exporter Instance", - "multi": false, - "multiFormat": "glob", - "name": "instance", - "options": [], - "query": "label_values(ceph_health_status, instance)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false } ] }, -- 2.47.3