From adc36dea7fc586c4d882462fbd3ab52006402b8a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Beno=C3=AEt=20Knecht?= Date: Mon, 3 Jan 2022 16:18:39 +0100 Subject: [PATCH] monitoring/grafana: Update radosgw dashboards MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit With the `ceph_daemon` label now replaced by `instance_id` on all `ceph_rgw_*` metrics, we need to update Grafana dashboards to get that label back from `ceph_rgw_metadata` using this type of construct: ``` ceph_rgw_req * on (instance_id) group_left(ceph_daemon) ceph_rgw_metadata ``` Signed-off-by: Benoît Knecht --- .../grafana/dashboards/hosts-overview.json | 2 +- .../jsonnet/grafana_dashboards.jsonnet | 44 +++++++++---------- .../grafana/dashboards/radosgw-detail.json | 26 +++++------ .../grafana/dashboards/radosgw-overview.json | 14 +++--- .../tests/features/radosgw_overview.feature | 21 +++++---- 5 files changed, 55 insertions(+), 52 deletions(-) diff --git a/monitoring/grafana/dashboards/hosts-overview.json b/monitoring/grafana/dashboards/hosts-overview.json index 115c1824974..91369b56e84 100644 --- a/monitoring/grafana/dashboards/hosts-overview.json +++ b/monitoring/grafana/dashboards/hosts-overview.json @@ -796,7 +796,7 @@ "multi": false, "name": "rgw_hosts", "options": [ ], - "query": "label_values(ceph_rgw_qlen, ceph_daemon)", + "query": "label_values(ceph_rgw_metadata, ceph_daemon)", "refresh": 1, "regex": "rgw.(.*)", "sort": 1, diff --git a/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet b/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet index d9deca18cc5..270d488e0ec 100644 --- a/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet +++ b/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet @@ -70,7 +70,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt addTemplateSchema('mds_hosts', '$datasource', 'label_values(ceph_mds_inodes, ceph_daemon)', 1, true, 1, null, 'mds.(.*)') ) 
.addTemplate( - addTemplateSchema('rgw_hosts', '$datasource', 'label_values(ceph_rgw_qlen, ceph_daemon)', 1, true, 1, null, 'rgw.(.*)') + addTemplateSchema('rgw_hosts', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, null, 'rgw.(.*)') ) .addPanels([ HostsOverviewSingleStatPanel( @@ -450,7 +450,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt type='panel', id='graph', name='Graph', version='5.0.0' ) .addTemplate( - addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_req, ceph_daemon)', 1, true, 1, '', '') + addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, '', '') ) .addTemplate( addTemplateSchema('code', '$datasource', 'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)', 1, true, 1, 'HTTP Code', '') @@ -468,14 +468,14 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt '', 's', 'short', - 'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])', + 'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', 'GET AVG', 0, 1, 8, 7 ) .addTargets( [ addTargetSchema( - 'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])', + 'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', 1, 'time_series', 'PUT AVG' @@ -485,7 +485,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt '', 'none', 'short', - 'sum by(rgw_host) (label_replace(rate(ceph_rgw_req[30s]), "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))', + 'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))', '{{rgw_host}}', 8, 1, 7, 7 
), @@ -494,7 +494,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts', 's', 'short', - 'label_replace(rate(ceph_rgw_get_initial_lat_sum[30s]),"rgw_host","$1","ceph_daemon","rgw.(.*)") / \nlabel_replace(rate(ceph_rgw_get_initial_lat_count[30s]),"rgw_host","$1","ceph_daemon","rgw.(.*)")', + 'label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', '{{rgw_host}}', 15, 1, 6, 7 ), @@ -520,7 +520,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt 'Total bytes transferred in/out through get/put operations, by radosgw instance', 'bytes', 'short', - 'sum by(rgw_host) (\n (label_replace(rate(ceph_rgw_get_b[30s]), "rgw_host","$1","ceph_daemon","rgw.(.*)")) + \n (label_replace(rate(ceph_rgw_put_b[30s]), "rgw_host","$1","ceph_daemon","rgw.(.*)"))\n)', + 'label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', '{{rgw_host}}', 8, 8, 7, 6 ), @@ -529,7 +529,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts', 's', 'short', - 'label_replace(rate(ceph_rgw_put_initial_lat_sum[30s]),"rgw_host","$1","ceph_daemon","rgw.(.*)") / \nlabel_replace(rate(ceph_rgw_put_initial_lat_count[30s]),"rgw_host","$1","ceph_daemon","rgw.(.*)")', + 'label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', 
'{{rgw_host}}', 15, 8, 6, 6 ), @@ -659,7 +659,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') ) .addTemplate( - addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_req, ceph_daemon)', 1, true, 1, '', '') + addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, '', '') ) .addPanels([ addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + {gridPos: {x: 0, y: 0, w: 24, h: 1}}, @@ -669,8 +669,8 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt '', 's', 'short', - 'sum by (ceph_daemon) (rate(ceph_rgw_get_initial_lat_sum{ceph_daemon=~"($rgw_servers)"}[30s]) / rate(ceph_rgw_get_initial_lat_count{ceph_daemon=~"($rgw_servers)"}[30s]))', - 'sum by (ceph_daemon)(rate(ceph_rgw_put_initial_lat_sum{ceph_daemon=~"($rgw_servers)"}[30s]) / rate(ceph_rgw_put_initial_lat_count{ceph_daemon=~"($rgw_servers)"}[30s]))', + 'sum by (instance_id) (rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 'GET {{ceph_daemon}}', 'PUT {{ceph_daemon}}', 0, 1, 6, 8 @@ -681,8 +681,8 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt '', 'bytes', 'short', - 'rate(ceph_rgw_get_b{ceph_daemon=~"$rgw_servers"}[30s])', - 'rate(ceph_rgw_put_b{ceph_daemon=~"$rgw_servers"}[30s])', + 'rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 'GETs 
{{ceph_daemon}}', 'PUTs {{ceph_daemon}}', 6, 1, 7, 8 @@ -693,8 +693,8 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt '', 'short', 'short', - 'rate(ceph_rgw_failed_req{ceph_daemon=~"$rgw_servers"}[30s])', - 'rate(ceph_rgw_get{ceph_daemon=~"$rgw_servers"}[30s])', + 'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 'Requests Failed {{ceph_daemon}}', 'GETs {{ceph_daemon}}', 13, 1, 7, 8 @@ -702,13 +702,13 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt .addTargets( [ addTargetSchema( - 'rate(ceph_rgw_put{ceph_daemon=~"$rgw_servers"}[30s])', + 'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'PUTs {{ceph_daemon}}' ), addTargetSchema( - 'rate(ceph_rgw_req{ceph_daemon=~"$rgw_servers"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~"$rgw_servers"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~"$rgw_servers"}[30s]))', + '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'Other {{ceph_daemon}}' @@ -722,10 +722,10 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt 'Workload Breakdown', 'current' ) - .addTarget(addTargetSchema('rate(ceph_rgw_failed_req{ceph_daemon=~"$rgw_servers"}[30s])', 1, 'time_series', 'Failures {{ceph_daemon}}')) - .addTarget(addTargetSchema('rate(ceph_rgw_get{ceph_daemon=~"$rgw_servers"}[30s])', 1, 'time_series', 'GETs {{ceph_daemon}}')) - .addTarget(addTargetSchema('rate(ceph_rgw_put{ceph_daemon=~"$rgw_servers"}[30s])', 1, 'time_series', 'PUTs {{ceph_daemon}}')) - 
.addTarget(addTargetSchema('rate(ceph_rgw_req{ceph_daemon=~"$rgw_servers"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~"$rgw_servers"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~"$rgw_servers"}[30s]))', 1, 'time_series', 'Other (DELETE,LIST) {{ceph_daemon}}')) + {gridPos: {x: 20, y: 1, w: 4, h: 8}} + .addTarget(addTargetSchema('rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'Failures {{ceph_daemon}}')) + .addTarget(addTargetSchema('rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'GETs {{ceph_daemon}}')) + .addTarget(addTargetSchema('rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'PUTs {{ceph_daemon}}')) + .addTarget(addTargetSchema('(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'Other (DELETE,LIST) {{ceph_daemon}}')) + {gridPos: {x: 20, y: 1, w: 4, h: 8}} ]) } { @@ -1171,7 +1171,7 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt addPieChartSchema(alias, '$datasource', description, 'Under graph', 'pie', title, 'current'); local OsdOverviewSingleStatPanel(colors, format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds, expr, targetFormat, x, y, w, h) = addSingelStatSchema(colors, '$datasource', format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds) - .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}}; + .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}}; dashboardSchema( 'OSD Overview', '', 'lo02I1Aiz', 'now-1h', '10s', 16, [], '', 
{refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} diff --git a/monitoring/grafana/dashboards/radosgw-detail.json b/monitoring/grafana/dashboards/radosgw-detail.json index 432eecc837c..53486475cbb 100644 --- a/monitoring/grafana/dashboards/radosgw-detail.json +++ b/monitoring/grafana/dashboards/radosgw-detail.json @@ -104,14 +104,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (ceph_daemon) (rate(ceph_rgw_get_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[30s]) / rate(ceph_rgw_get_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[30s]))", + "expr": "sum by (instance_id) (rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "GET {{ceph_daemon}}", "refId": "A" }, { - "expr": "sum by (ceph_daemon)(rate(ceph_rgw_put_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[30s]) / rate(ceph_rgw_put_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[30s]))", + "expr": "sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUT {{ceph_daemon}}", @@ -196,14 +196,14 @@ "steppedLine": false, "targets": [ { - "expr": "rate(ceph_rgw_get_b{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "GETs {{ceph_daemon}}", "refId": "A" }, { - "expr": "rate(ceph_rgw_put_b{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) 
ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUTs {{ceph_daemon}}", @@ -294,28 +294,28 @@ "steppedLine": false, "targets": [ { - "expr": "rate(ceph_rgw_failed_req{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Requests Failed {{ceph_daemon}}", "refId": "A" }, { - "expr": "rate(ceph_rgw_get{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "GETs {{ceph_daemon}}", "refId": "B" }, { - "expr": "rate(ceph_rgw_put{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUTs {{ceph_daemon}}", "refId": "C" }, { - "expr": "rate(ceph_rgw_req{ceph_daemon=~\"$rgw_servers\"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~\"$rgw_servers\"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~\"$rgw_servers\"}[30s]))", + "expr": "(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Other {{ceph_daemon}}", @@ -384,28 +384,28 @@ "pieType": "pie", "targets": [ { - "expr": "rate(ceph_rgw_failed_req{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Failures {{ceph_daemon}}", "refId": "A" }, { - "expr": 
"rate(ceph_rgw_get{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "GETs {{ceph_daemon}}", "refId": "B" }, { - "expr": "rate(ceph_rgw_put{ceph_daemon=~\"$rgw_servers\"}[30s])", + "expr": "rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUTs {{ceph_daemon}}", "refId": "C" }, { - "expr": "rate(ceph_rgw_req{ceph_daemon=~\"$rgw_servers\"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~\"$rgw_servers\"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~\"$rgw_servers\"}[30s]))", + "expr": "(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Other (DELETE,LIST) {{ceph_daemon}}", @@ -450,7 +450,7 @@ "multi": false, "name": "rgw_servers", "options": [ ], - "query": "label_values(ceph_rgw_req, ceph_daemon)", + "query": "label_values(ceph_rgw_metadata, ceph_daemon)", "refresh": 1, "regex": "", "sort": 1, diff --git a/monitoring/grafana/dashboards/radosgw-overview.json b/monitoring/grafana/dashboards/radosgw-overview.json index 489f29a2fc7..7fe94138b13 100644 --- a/monitoring/grafana/dashboards/radosgw-overview.json +++ b/monitoring/grafana/dashboards/radosgw-overview.json @@ -98,14 +98,14 @@ "steppedLine": false, "targets": [ { - "expr": "rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])", + "expr": "rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata", "format": "time_series", "intervalFactor": 1, "legendFormat": "GET AVG", "refId": "A" }, { - "expr": 
"rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])", + "expr": "rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUT AVG", @@ -190,7 +190,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(rgw_host) (label_replace(rate(ceph_rgw_req[30s]), \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"))", + "expr": "sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -275,7 +275,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(rate(ceph_rgw_get_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_get_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")", + "expr": "label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n\"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\")", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -452,7 +452,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(rgw_host) (\n (label_replace(rate(ceph_rgw_get_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")) + \n (label_replace(rate(ceph_rgw_put_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\"))\n)", + "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\")", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -537,7 +537,7 @@ "steppedLine": false, 
"targets": [ { - "expr": "label_replace(rate(ceph_rgw_put_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_put_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")", + "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n\"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\")", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -1093,7 +1093,7 @@ "multi": false, "name": "rgw_servers", "options": [ ], - "query": "label_values(ceph_rgw_req, ceph_daemon)", + "query": "label_values(ceph_rgw_metadata, ceph_daemon)", "refresh": 1, "regex": "", "sort": 1, diff --git a/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature b/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature index b77d56616bd..3e9724ee2dc 100644 --- a/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature +++ b/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature @@ -3,27 +3,30 @@ Feature: RGW Overview Dashboard Scenario: "Test Average GET Latencies" Given the following series: | metrics | values | - | ceph_rgw_get_initial_lat_sum{ceph_daemon="rgw.foo",instance="127.0.0.1", job="ceph"} | 10 50 100 | - | ceph_rgw_get_initial_lat_count{ceph_daemon="rgw.foo", instance="127.0.0.1", job="ceph"} | 20 60 80 | + | ceph_rgw_get_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 50 100 | + | ceph_rgw_get_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 20 60 80 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | When interval is `30s` Then Grafana panel `Average GET/PUT Latencies` with legend `GET AVG` shows: | metrics | values | - | {ceph_daemon="rgw.foo",instance="127.0.0.1", 
job="ceph"} | 2.5000000000000004 | + | {ceph_daemon="rgw.foo",instance="127.0.0.1", instance_id="58892247", job="ceph"} | 2.5000000000000004 | Scenario: "Test Average PUT Latencies" Given the following series: | metrics | values | - | ceph_rgw_put_initial_lat_sum{ceph_daemon="rgw.foo",instance="127.0.0.1", job="ceph"} | 15 35 55 | - | ceph_rgw_put_initial_lat_count{ceph_daemon="rgw.foo", instance="127.0.0.1", job="ceph"} | 10 30 50 | + | ceph_rgw_put_initial_lat_sum{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 15 35 55 | + | ceph_rgw_put_initial_lat_count{instance="127.0.0.1", instance_id="58892247", job="ceph"} | 10 30 50 | + | ceph_rgw_metadata{ceph_daemon="rgw.foo", hostname="localhost", instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 1 1 | When interval is `30s` Then Grafana panel `Average GET/PUT Latencies` with legend `PUT AVG` shows: | metrics | values | - | {ceph_daemon="rgw.foo",instance="127.0.0.1", job="ceph"} | 1 | + | {ceph_daemon="rgw.foo",instance="127.0.0.1", instance_id="58892247", job="ceph"} | 1 | Scenario: "Test Total Requests/sec by RGW Instance" Given the following series: | metrics | values | - | ceph_rgw_req{ceph_daemon="rgw.1",instance="127.0.0.1",job="ceph"} | 10 50 100 | + | ceph_rgw_req{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | + | ceph_rgw_metadata{ceph_daemon="rgw.1", hostname="localhost", instance="127.0.0.1", instance_id="92806566", job="ceph"} | 1 1 1 | When interval is `30s` Then Grafana panel `Total Requests/sec by RGW Instance` with legend `{{rgw_host}}` shows: | metrics | values | @@ -32,7 +35,7 @@ Scenario: "Test Total Requests/sec by RGW Instance" Scenario: "Test Bandwidth Consumed by Type- GET" Given the following series: | metrics | values | - | ceph_rgw_get_b{ceph_daemon="rgw.1",instance="127.0.0.1",job="ceph"} | 10 50 100 | + | ceph_rgw_get_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 10 50 100 | When evaluation time is `1m` And interval is `30s` 
Then Grafana panel `Bandwidth Consumed by Type` with legend `GETs` shows: @@ -42,7 +45,7 @@ Scenario: "Test Bandwidth Consumed by Type- GET" Scenario: "Test Bandwidth Consumed by Type- PUT" Given the following series: | metrics | values | - | ceph_rgw_put_b{ceph_daemon="rgw.1",instance="127.0.0.1",job="ceph"} | 5 20 50 | + | ceph_rgw_put_b{instance="127.0.0.1", instance_id="92806566", job="ceph"} | 5 20 50 | When evaluation time is `1m` And interval is `30s` Then Grafana panel `Bandwidth Consumed by Type` with legend `PUTs` shows: -- 2.39.5