From: Aashish Sharma Date: Thu, 28 Nov 2024 05:58:59 +0000 (+0530) Subject: mgr/dashboard: Add ceph_daemon filter to rgw overview grafana panel X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=a55d424c9ebc4dcad345248ce7d0fcf57b720e18;p=ceph.git mgr/dashboard: Add ceph_daemon filter to rgw overview grafana panel queries Currently rgw_servers filtering is not working in RGW Overview Grafana graphs. It is showing data of all the RGW services, even though the filter is set to a single service. This PR intends to solve this issue. Fixes: https://tracker.ceph.com/issues/69074 Signed-off-by: Aashish Sharma (cherry picked from commit 666f8faf11c0a639ce6ead06026850dd72a14d41) Conflicts: monitoring/ceph-mixin/dashboards/rgw.libsonnet (conflicts with metric names like ceph_rgw_put_initial_lat_sum) monitoring/ceph-mixin/dashboards_out/radosgw-overview.json (conflicts with metric names like ceph_rgw_put_initial_lat_sum) --- diff --git a/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/monitoring/ceph-mixin/dashboards/rgw.libsonnet index 557f1ddccb254..1e2533ecc4e2e 100644 --- a/monitoring/ceph-mixin/dashboards/rgw.libsonnet +++ b/monitoring/ceph-mixin/dashboards/rgw.libsonnet @@ -265,7 +265,7 @@ local u = import 'utils.libsonnet'; label_replace( rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), @@ -281,7 +281,7 @@ local u = import 'utils.libsonnet'; label_replace( rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) 
ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), @@ -298,7 +298,7 @@ local u = import 'utils.libsonnet'; sum by (rgw_host) ( label_replace( rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ) @@ -318,7 +318,7 @@ local u = import 'utils.libsonnet'; label_replace( rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) / rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), @@ -352,7 +352,7 @@ local u = import 'utils.libsonnet'; label_replace(sum by (instance_id) ( rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) + rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval])) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers", %(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ||| % $.matchers(), diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json index 92e416764e7c0..8d83bf8bbc38c 100644 --- a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json @@ -108,14 +108,14 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) 
ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "GET {{rgw_host}}", "refId": "A" }, { - "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUT {{rgw_host}}", @@ -210,7 +210,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n", + "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -305,7 +305,7 @@ "steppedLine": false, "targets": [ { - "expr": 
"label_replace(\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -502,7 +502,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_rgw_put_b{job=~\"$job\"}[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_rgw_put_b{job=~\"$job\"}[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -597,7 +597,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n 
rate(ceph_rgw_put_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", diff --git a/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature index 642e439787828..c1527a32708ac 100644 --- a/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature +++ b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature @@ -7,6 +7,7 @@ Scenario: "Test Average GET Latencies" | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `GET {{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1.5 | @@ -18,6 +19,7 @@ Scenario: "Test Average PUT Latencies" | ceph_rgw_put_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 30 50 | | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `Average GET/PUT Latencies by RGW Instance` with legend `PUT {{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1 | @@ -28,6 +30,7 @@ Scenario: "Test Total Requests/sec by RGW Instance" | ceph_rgw_req{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 
10 50 100 | | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.1` Then Grafana panel `Total Requests/sec by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | | {rgw_host="1"} | 1.5 | @@ -39,6 +42,7 @@ Scenario: "Test GET Latencies by RGW Instance" | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | When interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `GET Latencies by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1.5 | @@ -71,6 +75,7 @@ Scenario: "Test Bandwidth by RGW Instance" | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | When evaluation time is `1m` And interval is `30s` + And variable `rgw_servers` is `rgw.1` Then Grafana panel `Bandwidth by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.1", instance_id="92806566", rgw_host="1"} | 2.25 | @@ -83,6 +88,7 @@ Scenario: "Test PUT Latencies by RGW Instance" | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | When evaluation time is `1m` And interval is `30s` + And variable `rgw_servers` is `rgw.foo` Then Grafana panel `PUT Latencies by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | | {ceph_daemon="rgw.foo", instance="127.0.0.1", instance_id="58892247", job="ceph", rgw_host="foo"} | 1 |