From: Paul Cuzner Date: Thu, 28 Jun 2018 21:58:35 +0000 (+1200) Subject: Added RGW GET/PUT Latencies X-Git-Tag: v2.0~14^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3173b5ee15343fdfedfd2da5c371facd5b4c9e86;p=cephmetrics.git Added RGW GET/PUT Latencies Added multiple charts showing GET/PUT latencies at overview and RGW detail levels. In addition the failed HTTP request panel has been changed from a singlestat to a graph to visualize the failure rates across all RGW instances. Signed-off-by: Paul Cuzner --- diff --git a/dashboards/mgr-prometheus/ceph-rgw-workload.json b/dashboards/mgr-prometheus/ceph-rgw-workload.json index 9784269..b102420 100644 --- a/dashboards/mgr-prometheus/ceph-rgw-workload.json +++ b/dashboards/mgr-prometheus/ceph-rgw-workload.json @@ -4,7 +4,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "5.0.4" + "version": "5.0.0" }, { "type": "panel", @@ -42,7 +42,7 @@ "gnetId": null, "graphTooltip": 0, "id": null, - "iteration": 1524726965851, + "iteration": 1530165442642, "links": [], "panels": [ { @@ -67,11 +67,11 @@ "fill": 1, "gridPos": { "h": 7, - "w": 6, + "w": 8, "x": 0, "y": 1 }, - "id": 4, + "id": 29, "legend": { "avg": false, "current": false, @@ -91,21 +91,28 @@ "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by(ceph_daemon) (rate(ceph_rgw_req[30s]))", + "expr": "avg(rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]))", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{ceph_daemon}}", + "legendFormat": "GET AVG", "refId": "A" + }, + { + "expr": "avg(rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUT AVG", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Total Requests/sec by RGW Instance", + "title": "Average GET/PUT Latencies", "tooltip": { "shared": true, "sort": 0, @@ -121,8 +128,7 @@ }, "yaxes": [ { - "decimals": 0, - "format": "none", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -135,7 +141,7 @@ "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ] }, @@ -145,15 +151,14 @@ "dashLength": 10, "dashes": false, "datasource": null, - "description": "Total bytes transferred in/out of all radosgw instances within the cluster", "fill": 1, "gridPos": { "h": 7, - "w": 6, - "x": 6, + "w": 7, + "x": 8, "y": 1 }, - "id": 6, + "id": 4, "legend": { "avg": false, "current": false, @@ -177,24 +182,17 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(ceph_rgw_get_b[30s]))", + "expr": "sum by(rgw_host) (label_replace(rate(ceph_rgw_req[30s]), \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"))", "format": "time_series", "intervalFactor": 1, - "legendFormat": "GETs", + "legendFormat": "{{rgw_host}}", "refId": "A" - }, - { - "expr": "sum(rate(ceph_rgw_put_b[30s]))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "PUTs", - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Bandwidth Consumed by Type", + "title": "Total Requests/sec by RGW Instance", "tooltip": { "shared": true, "sort": 0, @@ -210,7 +208,8 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": 0, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -233,15 +232,15 @@ "dashLength": 10, "dashes": false, "datasource": null, - "description": "Total bytes transferred in/out through get/put operations, by radosgw instance", + "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts", "fill": 1, "gridPos": { "h": 7, "w": 6, - "x": 12, + "x": 15, "y": 1 }, - "id": 9, + "id": 31, "legend": { "avg": false, "current": false, @@ -265,17 +264,17 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(ceph_daemon) \n (rate(ceph_rgw_get_b[30s]) + rate(ceph_rgw_put_b[30s]))", + "expr": "label_replace(rate(ceph_rgw_get_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_get_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{ceph_daemon}}", + "legendFormat": "{{rgw_host}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Bandwidth by RGW Instance", + "title": "GET Latencies by RGW Instance", "tooltip": { "shared": true, "sort": 0, @@ -291,12 +290,13 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", - "show": true + "show": false }, { "format": "short", @@ -304,7 +304,7 @@ "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ] }, @@ -329,7 +329,7 @@ "gridPos": { "h": 7, "w": 3, - "x": 18, + "x": 21, "y": 1 }, "id": 8, @@ -389,116 +389,361 @@ "valueName": "avg" }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": null, - "decimals": 0, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, + "description": "Total bytes transferred in/out of all radosgw instances within the cluster", + "fill": 1, "gridPos": { - "h": 7, - "w": 3, - "x": 21, - "y": 1 + "h": 6, + "w": 8, + "x": 0, + "y": 8 }, - "id": 10, - "interval": null, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, "links": [], - "mappingType": 1, - "mappingTypes": [ + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ { - "name": "value to text", - "value": 1 + "expr": "sum(rate(ceph_rgw_get_b[30s]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs", + "refId": "A" }, { - "name": "range to text", - "value": 2 + "expr": "sum(rate(ceph_rgw_put_b[30s]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs", + "refId": "B" } ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Bandwidth Consumed by Type", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ { - "from": "null", - "text": "N/A", - "to": "null" + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total bytes transferred in/out through get/put operations, by radosgw instance", + "fill": 1, + "gridPos": { + "h": 6, + "w": 7, + "x": 8, + "y": 8 }, - "tableColumn": "", + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, "targets": [ { - "expr": "sum(ceph_rgw_failed_req)", + "expr": "sum by(rgw_host) (\n (label_replace(rate(ceph_rgw_get_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")) + \n (label_replace(rate(ceph_rgw_put_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\"))\n)", "format": "time_series", "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", "refId": "A" } ], - "thresholds": "", - "title": "Failed HTTP Requests", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Bandwidth by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ { - "op": "=", - "text": "N/A", - "value": "null" + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "valueName": "current" + ] }, { - "collapsed": true, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts", + "fill": 1, "gridPos": { - "h": 1, - "w": 24, - "x": 0, + "h": 6, + "w": 6, + "x": 15, "y": 8 }, - "id": 12, - "panels": [ - { - "aliasColors": { - "GETs": "#7eb26d", - "Other": "#447ebc", - "PUTs": "#eab839", - "Requests": "#3f2b5b", - "Requests Failed": "#bf1b00" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": null, - "fill": 1, - "gridPos": { + "id": 32, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(rate(ceph_rgw_put_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_put_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "PUT Latencies by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": false + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Failed HTTP Requests by RGW instance", + "fill": 1, + "gridPos": { + "h": 6, + "w": 3, + "x": 21, + "y": 8 + }, + "id": 41, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(ceph_rgw_failed_req, \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Failed Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": false + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 12, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { "h": 8, - "w": 9, + "w": 6, "x": 0, - "y": 9 + "y": 15 }, - "id": 14, + "id": 34, "legend": { "avg": false, "current": false, @@ -516,44 +761,37 @@ "pointradius": 5, "points": false, "renderer": "flot", + "scopedVars": { + "rgw_servers": { + "selected": false, + "text": "rgw.rhs-srv-01", + "value": "rgw.rhs-srv-01" + } + }, "seriesOverrides": [], "spaceLength": 10, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(ceph_rgw_failed_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s])", + "expr": "rate(ceph_rgw_get_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[30s]) / rate(ceph_rgw_get_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[30s])", "format": "time_series", "intervalFactor": 1, - "legendFormat": "Requests Failed", - "refId": "B" - }, - { - "expr": "rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s])", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "GETs", - "refId": "C" - }, - { - "expr": "rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s])", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "PUTs", - "refId": "D" + "legendFormat": "GET", + "refId": "A" }, { - "expr": "rate(ceph_rgw_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s]))", + "expr": "rate(ceph_rgw_put_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[30s]) / rate(ceph_rgw_put_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[30s])", "format": "time_series", "intervalFactor": 1, - "legendFormat": "Other", - "refId": "A" + "legendFormat": "PUT", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "HTTP Request Breakdown", + "title": "$rgw_servers GET/PUT Latencies", "tooltip": { "shared": true, "sort": 0, @@ -569,11 +807,11 @@ }, "yaxes": [ { - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -582,7 +820,7 @@ "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ] }, @@ -595,9 +833,9 @@ "fill": 1, "gridPos": { "h": 8, - "w": 8, - "x": 9, - "y": 9 + "w": 7, + "x": 6, + "y": 15 }, "id": 18, "legend": { @@ -617,6 +855,13 @@ "pointradius": 5, "points": false, "renderer": "flot", + "scopedVars": { + "rgw_servers": { + "selected": false, + "text": "rgw.rhs-srv-01", + "value": "rgw.rhs-srv-01" + } + }, "seriesOverrides": [], "spaceLength": 10, "stack": true, @@ -674,6 +919,120 @@ } ] }, + { + "aliasColors": { + "GETs": "#7eb26d", + "Other": "#447ebc", + "PUTs": "#eab839", + "Requests": "#3f2b5b", + "Requests Failed": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 8, + "w": 7, + "x": 13, + "y": 15 + }, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "scopedVars": { + "rgw_servers": { + "selected": false, + "text": "rgw.rhs-srv-01", + "value": "rgw.rhs-srv-01" + } + }, + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rgw_failed_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests Failed", + "refId": "B" + }, + { + "expr": "rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs", + "refId": "C" + }, + { + "expr": "rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs", + "refId": "D" + }, + { + "expr": "rate(ceph_rgw_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) -\n (rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) +\n rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Other", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Request Breakdown", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, { "aliasColors": { "Failures": "#bf1b00", @@ -692,9 +1051,9 @@ "format": "none", "gridPos": { "h": 8, - "w": 7, - "x": 17, - "y": 9 + "w": 4, + "x": 20, + "y": 15 }, "id": 23, "interval": null, @@ -707,6 +1066,13 @@ "maxDataPoints": 3, "nullPointMode": "connected", "pieType": "pie", + "scopedVars": { + "rgw_servers": { + "selected": false, + "text": "rgw.rhs-srv-01", + "value": "rgw.rhs-srv-01" + } + }, "strokeWidth": 1, "targets": [ { @@ -810,5 +1176,5 @@ }, "timezone": "", "title": "Ceph RGW Workload", - "version": 15 + "version": 26 }