From 666f8faf11c0a639ce6ead06026850dd72a14d41 Mon Sep 17 00:00:00 2001 From: Aashish Sharma Date: Thu, 28 Nov 2024 11:28:59 +0530 Subject: [PATCH] mgr/dashboard: Add ceph_daemon filter to rgw overview grafana panel queries Currently rgw_servers filtering is not working in the RGW Overview Grafana graphs. They show data for all the RGW services, even though the filter is set to a single service. This PR intends to solve this issue. Fixes: https://tracker.ceph.com/issues/69074 Signed-off-by: Aashish Sharma --- monitoring/ceph-mixin/dashboards/rgw.libsonnet | 12 ++++++------ .../ceph-mixin/dashboards_out/radosgw-overview.json | 12 ++++++------ .../features/radosgw_overview.feature | 6 ++++++ 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/monitoring/ceph-mixin/dashboards/rgw.libsonnet index 79a4b7a14eb0..c0c548b79c86 100644 --- a/monitoring/ceph-mixin/dashboards/rgw.libsonnet +++ b/monitoring/ceph-mixin/dashboards/rgw.libsonnet @@ -298,7 +298,7 @@ local g = import 'grafonnet/grafana.libsonnet'; label_replace( rate(ceph_rgw_op_get_obj_lat_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_rgw_op_get_obj_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), @@ -314,7 +314,7 @@ local g = import 'grafonnet/grafana.libsonnet'; label_replace( rate(ceph_rgw_op_put_obj_lat_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_rgw_op_put_obj_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), @@ -331,7 +331,7 @@ local g = import 
'grafonnet/grafana.libsonnet'; sum by (rgw_host) ( label_replace( rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ) @@ -351,7 +351,7 @@ local g = import 'grafonnet/grafana.libsonnet'; label_replace( rate(ceph_rgw_op_get_obj_lat_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_rgw_op_get_obj_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), @@ -385,7 +385,7 @@ local g = import 'grafonnet/grafana.libsonnet'; label_replace(sum by (instance_id) ( rate(ceph_rgw_op_get_obj_bytes{%(matchers)s}[$__rate_interval]) + rate(ceph_rgw_op_put_obj_bytes{%(matchers)s}[$__rate_interval])) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), @@ -404,7 +404,7 @@ local g = import 'grafonnet/grafana.libsonnet'; label_replace( rate(ceph_rgw_op_put_obj_lat_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_rgw_op_put_obj_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json index 5e185b63b7f4..5bf8279c27ce 100644 --- 
a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json @@ -108,14 +108,14 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "GET {{rgw_host}}", "refId": "A" }, { - "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUT {{rgw_host}}", @@ -210,7 +210,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) 
ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n", + "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -305,7 +305,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -502,7 +502,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) 
ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -597,7 +597,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", diff --git a/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature index 8d96dcdd6107..a34d57594378 100644 --- a/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature +++ b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature @@ -7,6 +7,7 @@ Scenario: "Test Average GET Latencies" | ceph_rgw_op_get_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 20 60 80 | | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `GET {{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", 
instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1.5 | @@ -18,6 +19,7 @@ Scenario: "Test Average PUT Latencies" | ceph_rgw_op_put_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 10 30 50 | | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `PUT {{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1 | @@ -28,6 +30,7 @@ Scenario: "Test Total Requests/sec by RGW Instance" | ceph_rgw_req{instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 10 50 100 | | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.1` Then Grafana panel `Total Requests/sec by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | | {rgw_host="1"} | 1.5 | @@ -39,6 +42,7 @@ Scenario: "Test GET Latencies by RGW Instance" | ceph_rgw_op_get_obj_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 20 60 80 | | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `GET Latencies by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1.5 | @@ -71,6 +75,7 @@ Scenario: "Test Bandwidth by RGW Instance" | 
ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph", cluster="mycluster"} | 1 1 1 | When evaluation time is `1m` And interval is `30s` + And variable `rgw_servers` is `rgw.1` Then Grafana panel `Bandwidth by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.1", instance_id="92806566", rgw_host="1"} | 2.25 | @@ -83,6 +88,7 @@ Scenario: "Test PUT Latencies by RGW Instance" | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph", cluster="mycluster"} | 1 1 1 | When evaluation time is `1m` And interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `PUT Latencies by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo", cluster="mycluster"} | 1 | -- 2.47.3