git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/dashboard: introduce HAProxy metrics for RGW 43991/head
author     Avan Thakkar <athakkar@redhat.com>
           Wed, 17 Nov 2021 23:26:12 +0000 (04:56 +0530)
committer  Avan Thakkar <athakkar@redhat.com>
           Thu, 9 Dec 2021 14:33:03 +0000 (20:03 +0530)
Fixes: https://tracker.ceph.com/issues/53311
Signed-off-by: Avan Thakkar <athakkar@redhat.com>
monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet
monitoring/grafana/dashboards/radosgw-overview.json
monitoring/grafana/dashboards/tests/features/radosgw_overview.feature [new file with mode: 0644]

diff --git a/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet b/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet
index b9f5bffaa808ac7c9b957781a6866dc61b9751a2..26a7a3c36edb58d0df555a40044c65df79613977 100644
--- a/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet
+++ b/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet
@@ -3,8 +3,8 @@ local g = import 'grafana.libsonnet';
 local dashboardSchema(title, description, uid, time_from, refresh, schemaVersion, tags, timezone, timepicker) =
   g.dashboard.new(title=title, description=description, uid=uid, time_from=time_from, refresh=refresh, schemaVersion=schemaVersion, tags=tags, timezone=timezone, timepicker=timepicker);
 
-local graphPanelSchema(aliasColors, title, description, nullPointMode, stack, formatY1, formatY2, labelY1, labelY2, min, fill, datasource) =
-  g.graphPanel.new(aliasColors=aliasColors, title=title, description=description, nullPointMode=nullPointMode, stack=stack, formatY1=formatY1, formatY2=formatY2, labelY1=labelY1, labelY2=labelY2, min=min, fill=fill, datasource=datasource);
+local graphPanelSchema(aliasColors, title, description, nullPointMode, stack, formatY1, formatY2, labelY1, labelY2, min, fill, datasource, legend_alignAsTable=false, legend_avg=false, legend_min=false, legend_max=false, legend_current=false, legend_values=false) =
+  g.graphPanel.new(aliasColors=aliasColors, title=title, description=description, nullPointMode=nullPointMode, stack=stack, formatY1=formatY1, formatY2=formatY2, labelY1=labelY1, labelY2=labelY2, min=min, fill=fill, datasource=datasource, legend_alignAsTable=legend_alignAsTable, legend_avg=legend_avg, legend_min=legend_min, legend_max=legend_max, legend_current=legend_current, legend_values=legend_values);
 
 local addTargetSchema(expr, intervalFactor, format, legendFormat) =
   g.prometheus.target(expr=expr, intervalFactor=intervalFactor, format=format, legendFormat=legendFormat);
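
Note on the hunk above: graphPanelSchema now threads six optional legend flags straight through to grafonnet's g.graphPanel.new, all defaulting to false, so existing call sites keep their old behaviour. A minimal call-site sketch, valid inside this file; the panel title and values are illustrative, not part of the commit:

    // Hypothetical call site opting in to the tabular legend; panels that
    // omit the new keyword arguments are unchanged.
    local demoPanel = graphPanelSchema(
      {},                  // aliasColors
      'Demo panel', '',    // title, description (illustrative)
      'null', false,       // nullPointMode, stack
      'short', 'short',    // formatY1, formatY2
      null, null, 0, 1,    // labelY1, labelY2, min, fill
      '$datasource',
      legend_alignAsTable=true,
      legend_avg=true,
      legend_current=true,
      legend_values=true   // legend_min/legend_max stay false by default
    );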
@@ -250,8 +250,8 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt
 }
 {
   "radosgw-overview.json":
-    local RgwOverviewPanel(title, description, formatY1, formatY2, expr1, legendFormat1, x, y, w, h) =
-      graphPanelSchema({}, title, description, 'null', false, formatY1, formatY2, null, null, 0, 1, '$datasource')
+    local RgwOverviewPanel(title, description, formatY1, formatY2, expr1, legendFormat1, x, y, w, h, datasource='$datasource', legend_alignAsTable=false, legend_avg=false, legend_min=false, legend_max=false, legend_current=false, legend_values=false) =
+      graphPanelSchema({}, title, description, 'null', false, formatY1, formatY2, null, null, 0, 1, datasource, legend_alignAsTable, legend_avg, legend_min, legend_max, legend_current, legend_values)
       .addTargets(
         [addTargetSchema(expr1, 1, 'time_series', legendFormat1)]) + {gridPos: {x: x, y: y, w: w, h: h}};
 
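The wrapper change mirrors the helper change: RgwOverviewPanel gains a datasource parameter plus the same legend flags, all defaulted, so the two existing RGW panels keep their ten-argument calls while the HAProxy panels added below opt in. A sketch of both arities (expressions illustrative):

    // Pre-existing panels: ten positional arguments, defaults fill the rest.
    local oldStyle = RgwOverviewPanel(
      'Old-style panel', '', 'short', 'short', 'sum(up)', 'demo', 0, 0, 5, 5);
    // HAProxy panels: explicit datasource plus the six legend flags.
    local newStyle = RgwOverviewPanel(
      'HAProxy-style panel', '', 'short', 'short', 'sum(up)', 'demo', 0, 0, 5, 5,
      '$datasource', true, true, true, true, true, true);
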
@@ -271,6 +271,12 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt
     .addTemplate(
        addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_req, ceph_daemon)', 1, true, 1, '', '')
     )
+    .addTemplate(
+       addTemplateSchema('code', '$datasource', 'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)', 1, true, 1, 'HTTP Code', '')
+    )
+    .addTemplate(
+       addTemplateSchema('ingress_service', '$datasource', 'label_values(haproxy_server_status, instance)', 1, true, 1, 'Ingress Service', '')
+    )
     .addTemplate(
        g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
     )
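
The two new template variables chain: $code is resolved through a label_values() query that filters on instance=~"$ingress_service", so Grafana re-populates the HTTP-code dropdown whenever the selected ingress service changes. For comparison, the same variable built on raw grafonnet rather than the file's local addTemplateSchema helper; the template.new signature is assumed from grafonnet-lib, so treat this as a sketch:

    // 'load' serialises to refresh: 1, matching the generated JSON below.
    local codeVar = g.template.new(
      'code',
      '$datasource',
      'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)',
      label='HTTP Code',
      refresh='load',
      includeAll=true,
      sort=1
    );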
@@ -289,8 +295,72 @@ local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, patt
       RgwOverviewPanel(
         'Bandwidth by RGW Instance', 'Total bytes transferred in/out through get/put operations, by radosgw instance', 'bytes', 'short', 'sum by(rgw_host) (\n  (label_replace(rate(ceph_rgw_get_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")) + \n  (label_replace(rate(ceph_rgw_put_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\"))\n)', '{{rgw_host}}', 8, 8, 7, 6),
       RgwOverviewPanel(
-        'PUT Latencies by RGW Instance', 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts', 's', 'short', 'label_replace(rate(ceph_rgw_put_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_put_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")', '{{rgw_host}}', 15, 8, 6, 6)
-    ])
+        'PUT Latencies by RGW Instance', 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts', 's', 'short', 'label_replace(rate(ceph_rgw_put_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_put_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")', '{{rgw_host}}', 15, 8, 6, 6),
+   
+      addRowSchema(false, true, 'RGW Overview - HAProxy Metrics') + {gridPos: {x: 0, y: 12, w: 9, h: 12}},
+      RgwOverviewPanel(
+        'Total responses by HTTP code', '', 'short', 'short', 'sum(irate(haproxy_frontend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"frontend"}[5m])) by (code)', 'Frontend {{ code }}', 0, 12, 5, 12, '$datasource', true, true, true, true, true, true)
+        .addTargets(
+        [addTargetSchema('sum(irate(haproxy_backend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"backend"}[5m])) by (code)', 1, 'time_series', 'Backend {{ code }}')])
+        .addSeriesOverride([
+          { "alias": "/.*Back.*/",
+            "transform": "negative-Y" },
+          { "alias": "/.*1.*/" },
+          { "alias": "/.*2.*/" },
+          { "alias": "/.*3.*/" },
+          { "alias": "/.*4.*/" },
+          { "alias": "/.*5.*/" },
+          { "alias": "/.*other.*/" }
+        ]),
+      RgwOverviewPanel(
+        'Total requests / responses', '', 'short', 'short',
+        'sum(irate(haproxy_frontend_http_requests_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 'Requests', 5, 12, 5, 12, '$datasource', true, true, true, true, true, true)
+        .addTargets(
+        [addTargetSchema('sum(irate(haproxy_backend_response_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Response errors'),
+        addTargetSchema('sum(irate(haproxy_frontend_request_errors_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Requests errors'),
+        addTargetSchema('sum(irate(haproxy_backend_redispatch_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend redispatch'),
+        addTargetSchema('sum(irate(haproxy_backend_retry_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend retry'),
+        addTargetSchema('sum(irate(haproxy_frontend_requests_denied_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Request denied'),
+        addTargetSchema('sum(haproxy_backend_current_queue{proxy=~"backend",instance=~"$ingress_service"}) by (instance)', 2, 'time_series', 'Backend Queued'),
+        ])
+        .addSeriesOverride([
+           {
+              "alias": "/.*Response.*/",
+              "transform": "negative-Y"
+            },
+            {
+              "alias": "/.*Backend.*/",
+              "transform": "negative-Y"
+            }
+        ]),
+        RgwOverviewPanel(
+        'Total number of connections', '', 'short', 'short',
+        'sum(irate(haproxy_frontend_connections_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 'Front', 10, 12, 5, 12, '$datasource', true, true, true, true, true, true)
+        .addTargets(
+        [addTargetSchema('sum(irate(haproxy_backend_connection_attempts_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back'),
+        addTargetSchema('sum(irate(haproxy_backend_connection_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back errors'),
+        ])
+        .addSeriesOverride([
+           {
+             "alias": "/.*Back.*/",
+             "transform": "negative-Y"
+           }
+        ]),
+        RgwOverviewPanel(
+        'Current total of incoming / outgoing bytes', '', 'short', 'short',
+        'sum(irate(haproxy_frontend_bytes_in_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 'IN Front', 15, 12, 6, 12, '$datasource', true, true, true, true, true, true)
+        .addTargets(
+        [addTargetSchema('sum(irate(haproxy_frontend_bytes_out_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Front'),
+        addTargetSchema('sum(irate(haproxy_backend_bytes_in_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'IN Back'),
+        addTargetSchema('sum(irate(haproxy_backend_bytes_out_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Back')
+        ])
+        .addSeriesOverride([
+           {
+             "alias": "/.*OUT.*/",
+             "transform": "negative-Y"
+           }
+        ])
+      ])
 }
 {
   "radosgw-detail.json":
diff --git a/monitoring/grafana/dashboards/radosgw-overview.json b/monitoring/grafana/dashboards/radosgw-overview.json
index f996fed95e6e6a31a0f53598af5a9c196d1b66fd..489f29a2fc783c5fa0fe080104b1d3d219fd8e2b 100644
--- a/monitoring/grafana/dashboards/radosgw-overview.json
+++ b/monitoring/grafana/dashboards/radosgw-overview.json
                "show": true
             }
          ]
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "gridPos": {
+            "h": 12,
+            "w": 9,
+            "x": 0,
+            "y": 12
+         },
+         "id": 9,
+         "panels": [ ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "RGW Overview - HAProxy Metrics",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "aliasColors": { },
+         "bars": false,
+         "dashLength": 10,
+         "dashes": false,
+         "datasource": "$datasource",
+         "description": "",
+         "fill": 1,
+         "gridPos": {
+            "h": 12,
+            "w": 5,
+            "x": 0,
+            "y": 12
+         },
+         "id": 10,
+         "legend": {
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": false,
+            "show": true,
+            "sideWidth": null,
+            "total": false,
+            "values": true
+         },
+         "lines": true,
+         "linewidth": 1,
+         "links": [ ],
+         "nullPointMode": "null",
+         "percentage": false,
+         "pointradius": 5,
+         "points": false,
+         "renderer": "flot",
+         "repeat": null,
+         "seriesOverrides": [
+            [
+               {
+                  "alias": "/.*Back.*/",
+                  "transform": "negative-Y"
+               },
+               {
+                  "alias": "/.*1.*/"
+               },
+               {
+                  "alias": "/.*2.*/"
+               },
+               {
+                  "alias": "/.*3.*/"
+               },
+               {
+                  "alias": "/.*4.*/"
+               },
+               {
+                  "alias": "/.*5.*/"
+               },
+               {
+                  "alias": "/.*other.*/"
+               }
+            ]
+         ],
+         "spaceLength": 10,
+         "stack": false,
+         "steppedLine": false,
+         "targets": [
+            {
+               "expr": "sum(irate(haproxy_frontend_http_responses_total{code=~\"$code\",instance=~\"$ingress_service\",proxy=~\"frontend\"}[5m])) by (code)",
+               "format": "time_series",
+               "intervalFactor": 1,
+               "legendFormat": "Frontend {{ code }}",
+               "refId": "A"
+            },
+            {
+               "expr": "sum(irate(haproxy_backend_http_responses_total{code=~\"$code\",instance=~\"$ingress_service\",proxy=~\"backend\"}[5m])) by (code)",
+               "format": "time_series",
+               "intervalFactor": 1,
+               "legendFormat": "Backend {{ code }}",
+               "refId": "B"
+            }
+         ],
+         "thresholds": [ ],
+         "timeFrom": null,
+         "timeShift": null,
+         "title": "Total responses by HTTP code",
+         "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+         },
+         "type": "graph",
+         "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": [ ]
+         },
+         "yaxes": [
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": 0,
+               "show": true
+            },
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": 0,
+               "show": true
+            }
+         ]
+      },
+      {
+         "aliasColors": { },
+         "bars": false,
+         "dashLength": 10,
+         "dashes": false,
+         "datasource": "$datasource",
+         "description": "",
+         "fill": 1,
+         "gridPos": {
+            "h": 12,
+            "w": 5,
+            "x": 5,
+            "y": 12
+         },
+         "id": 11,
+         "legend": {
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": false,
+            "show": true,
+            "sideWidth": null,
+            "total": false,
+            "values": true
+         },
+         "lines": true,
+         "linewidth": 1,
+         "links": [ ],
+         "nullPointMode": "null",
+         "percentage": false,
+         "pointradius": 5,
+         "points": false,
+         "renderer": "flot",
+         "repeat": null,
+         "seriesOverrides": [
+            [
+               {
+                  "alias": "/.*Response.*/",
+                  "transform": "negative-Y"
+               },
+               {
+                  "alias": "/.*Backend.*/",
+                  "transform": "negative-Y"
+               }
+            ]
+         ],
+         "spaceLength": 10,
+         "stack": false,
+         "steppedLine": false,
+         "targets": [
+            {
+               "expr": "sum(irate(haproxy_frontend_http_requests_total{proxy=~\"frontend\",instance=~\"$ingress_service\"}[5m])) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 1,
+               "legendFormat": "Requests",
+               "refId": "A"
+            },
+            {
+               "expr": "sum(irate(haproxy_backend_response_errors_total{proxy=~\"backend\",instance=~\"$ingress_service\"}[5m])) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "Response errors",
+               "refId": "B"
+            },
+            {
+               "expr": "sum(irate(haproxy_frontend_request_errors_total{proxy=~\"frontend\",instance=~\"$ingress_service\"}[5m])) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 1,
+               "legendFormat": "Requests errors",
+               "refId": "C"
+            },
+            {
+               "expr": "sum(irate(haproxy_backend_redispatch_warnings_total{proxy=~\"backend\",instance=~\"$ingress_service\"}[5m])) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "Backend redispatch",
+               "refId": "D"
+            },
+            {
+               "expr": "sum(irate(haproxy_backend_retry_warnings_total{proxy=~\"backend\",instance=~\"$ingress_service\"}[5m])) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "Backend retry",
+               "refId": "E"
+            },
+            {
+               "expr": "sum(irate(haproxy_frontend_requests_denied_total{proxy=~\"frontend\",instance=~\"$ingress_service\"}[5m])) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "Request denied",
+               "refId": "F"
+            },
+            {
+               "expr": "sum(haproxy_backend_current_queue{proxy=~\"backend\",instance=~\"$ingress_service\"}) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "Backend Queued",
+               "refId": "G"
+            }
+         ],
+         "thresholds": [ ],
+         "timeFrom": null,
+         "timeShift": null,
+         "title": "Total requests / responses",
+         "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+         },
+         "type": "graph",
+         "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": [ ]
+         },
+         "yaxes": [
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": 0,
+               "show": true
+            },
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": 0,
+               "show": true
+            }
+         ]
+      },
+      {
+         "aliasColors": { },
+         "bars": false,
+         "dashLength": 10,
+         "dashes": false,
+         "datasource": "$datasource",
+         "description": "",
+         "fill": 1,
+         "gridPos": {
+            "h": 12,
+            "w": 5,
+            "x": 10,
+            "y": 12
+         },
+         "id": 12,
+         "legend": {
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": false,
+            "show": true,
+            "sideWidth": null,
+            "total": false,
+            "values": true
+         },
+         "lines": true,
+         "linewidth": 1,
+         "links": [ ],
+         "nullPointMode": "null",
+         "percentage": false,
+         "pointradius": 5,
+         "points": false,
+         "renderer": "flot",
+         "repeat": null,
+         "seriesOverrides": [
+            [
+               {
+                  "alias": "/.*Back.*/",
+                  "transform": "negative-Y"
+               }
+            ]
+         ],
+         "spaceLength": 10,
+         "stack": false,
+         "steppedLine": false,
+         "targets": [
+            {
+               "expr": "sum(irate(haproxy_frontend_connections_total{proxy=~\"frontend\",instance=~\"$ingress_service\"}[5m])) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 1,
+               "legendFormat": "Front",
+               "refId": "A"
+            },
+            {
+               "expr": "sum(irate(haproxy_backend_connection_attempts_total{proxy=~\"backend\",instance=~\"$ingress_service\"}[5m])) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 1,
+               "legendFormat": "Back",
+               "refId": "B"
+            },
+            {
+               "expr": "sum(irate(haproxy_backend_connection_errors_total{proxy=~\"backend\",instance=~\"$ingress_service\"}[5m])) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 1,
+               "legendFormat": "Back errors",
+               "refId": "C"
+            }
+         ],
+         "thresholds": [ ],
+         "timeFrom": null,
+         "timeShift": null,
+         "title": "Total number of connections",
+         "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+         },
+         "type": "graph",
+         "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": [ ]
+         },
+         "yaxes": [
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": 0,
+               "show": true
+            },
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": 0,
+               "show": true
+            }
+         ]
+      },
+      {
+         "aliasColors": { },
+         "bars": false,
+         "dashLength": 10,
+         "dashes": false,
+         "datasource": "$datasource",
+         "description": "",
+         "fill": 1,
+         "gridPos": {
+            "h": 12,
+            "w": 6,
+            "x": 15,
+            "y": 12
+         },
+         "id": 13,
+         "legend": {
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": false,
+            "show": true,
+            "sideWidth": null,
+            "total": false,
+            "values": true
+         },
+         "lines": true,
+         "linewidth": 1,
+         "links": [ ],
+         "nullPointMode": "null",
+         "percentage": false,
+         "pointradius": 5,
+         "points": false,
+         "renderer": "flot",
+         "repeat": null,
+         "seriesOverrides": [
+            [
+               {
+                  "alias": "/.*OUT.*/",
+                  "transform": "negative-Y"
+               }
+            ]
+         ],
+         "spaceLength": 10,
+         "stack": false,
+         "steppedLine": false,
+         "targets": [
+            {
+               "expr": "sum(irate(haproxy_frontend_bytes_in_total{proxy=~\"frontend\",instance=~\"$ingress_service\"}[5m])*8) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 1,
+               "legendFormat": "IN Front",
+               "refId": "A"
+            },
+            {
+               "expr": "sum(irate(haproxy_frontend_bytes_out_total{proxy=~\"frontend\",instance=~\"$ingress_service\"}[5m])*8) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "OUT Front",
+               "refId": "B"
+            },
+            {
+               "expr": "sum(irate(haproxy_backend_bytes_in_total{proxy=~\"backend\",instance=~\"$ingress_service\"}[5m])*8) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "IN Back",
+               "refId": "C"
+            },
+            {
+               "expr": "sum(irate(haproxy_backend_bytes_out_total{proxy=~\"backend\",instance=~\"$ingress_service\"}[5m])*8) by (instance)",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "OUT Back",
+               "refId": "D"
+            }
+         ],
+         "thresholds": [ ],
+         "timeFrom": null,
+         "timeShift": null,
+         "title": "Current total of incoming / outgoing bytes",
+         "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+         },
+         "type": "graph",
+         "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": [ ]
+         },
+         "yaxes": [
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": 0,
+               "show": true
+            },
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": 0,
+               "show": true
+            }
+         ]
       }
    ],
    "refresh": "15s",
             "type": "query",
             "useTags": false
          },
+         {
+            "allValue": null,
+            "current": { },
+            "datasource": "$datasource",
+            "hide": 0,
+            "includeAll": true,
+            "label": "HTTP Code",
+            "multi": false,
+            "name": "code",
+            "options": [ ],
+            "query": "label_values(haproxy_server_http_responses_total{instance=~\"$ingress_service\"}, code)",
+            "refresh": 1,
+            "regex": "",
+            "sort": 1,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         },
+         {
+            "allValue": null,
+            "current": { },
+            "datasource": "$datasource",
+            "hide": 0,
+            "includeAll": true,
+            "label": "Ingress Service",
+            "multi": false,
+            "name": "ingress_service",
+            "options": [ ],
+            "query": "label_values(haproxy_server_status, instance)",
+            "refresh": 1,
+            "regex": "",
+            "sort": 1,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         },
          {
             "current": {
                "text": "default",
diff --git a/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature b/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature
new file mode 100644 (file)
index 0000000..b77d566
--- /dev/null
+++ b/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature
@@ -0,0 +1,212 @@
+Feature: RGW Overview Dashboard
+
+Scenario: "Test Average GET Latencies"
+  Given the following series:
+    | metrics | values |
+    | ceph_rgw_get_initial_lat_sum{ceph_daemon="rgw.foo",instance="127.0.0.1", job="ceph"} | 10 50 100 |
+    | ceph_rgw_get_initial_lat_count{ceph_daemon="rgw.foo", instance="127.0.0.1", job="ceph"} | 20 60 80 |
+  When interval is `30s`
+  Then Grafana panel `Average GET/PUT Latencies` with legend `GET AVG` shows:
+    | metrics | values |
+    | {ceph_daemon="rgw.foo",instance="127.0.0.1", job="ceph"} | 2.5000000000000004 |
+
+Scenario: "Test Average PUT Latencies"
+  Given the following series:
+    | metrics | values |
+    | ceph_rgw_put_initial_lat_sum{ceph_daemon="rgw.foo",instance="127.0.0.1", job="ceph"} | 15 35 55 |
+    | ceph_rgw_put_initial_lat_count{ceph_daemon="rgw.foo", instance="127.0.0.1", job="ceph"} | 10 30 50 |
+  When interval is `30s`
+  Then Grafana panel `Average GET/PUT Latencies` with legend `PUT AVG` shows:
+    | metrics | values |
+    | {ceph_daemon="rgw.foo",instance="127.0.0.1", job="ceph"} | 1 |
+
+Scenario: "Test Total Requests/sec by RGW Instance"
+  Given the following series:
+    | metrics | values |
+    | ceph_rgw_req{ceph_daemon="rgw.1",instance="127.0.0.1",job="ceph"} | 10 50 100 |
+  When interval is `30s`
+  Then Grafana panel `Total Requests/sec by RGW Instance` with legend `{{rgw_host}}` shows:
+    | metrics | values |
+    | {rgw_host="1"} | 1.6666666666666667 |
+
+Scenario: "Test Bandwidth Consumed by Type- GET"
+  Given the following series:
+    | metrics | values |
+    | ceph_rgw_get_b{ceph_daemon="rgw.1",instance="127.0.0.1",job="ceph"} | 10 50 100 |
+  When evaluation time is `1m`
+  And interval is `30s`
+  Then Grafana panel `Bandwidth Consumed by Type` with legend `GETs` shows:
+    | metrics | values |
+    | {} | 1.6666666666666667 |
+
+Scenario: "Test Bandwidth Consumed by Type- PUT"
+  Given the following series:
+    | metrics | values |
+    | ceph_rgw_put_b{ceph_daemon="rgw.1",instance="127.0.0.1",job="ceph"} | 5 20 50 |
+  When evaluation time is `1m`
+  And interval is `30s`
+  Then Grafana panel `Bandwidth Consumed by Type` with legend `PUTs` shows:
+    | metrics | values |
+    | {} | 1 |
+
+Scenario: "Test Total backend responses by HTTP code"
+  Given the following series:
+    | metrics | values |
+    | haproxy_backend_http_responses_total{code="200",instance="ingress.rgw.1",proxy="backend"} | 10 100 |
+    | haproxy_backend_http_responses_total{code="404",instance="ingress.rgw.1",proxy="backend"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  And variable `code` is `200`
+  Then Grafana panel `Total responses by HTTP code` with legend `Backend {{ code }}` shows:
+    | metrics | values |
+    | {code="200"} | 1.5 |
+
+Scenario: "Test Total frontend responses by HTTP code"
+  Given the following series:
+    | metrics | values |
+    | haproxy_frontend_http_responses_total{code="200",instance="ingress.rgw.1",proxy="frontend"} | 10 100 |
+    | haproxy_frontend_http_responses_total{code="404",instance="ingress.rgw.1",proxy="frontend"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  And variable `code` is `200`
+  Then Grafana panel `Total responses by HTTP code` with legend `Frontend {{ code }}` shows:
+    | metrics | values |
+    | {code="200"} | 1.5 |
+
+Scenario: "Test Total http frontend requests by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_frontend_http_requests_total{proxy="frontend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_frontend_http_requests_total{proxy="frontend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Total requests / responses` with legend `Requests` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 3 |
+
+Scenario: "Test Total backend response errors by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_backend_response_errors_total{proxy="backend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_backend_response_errors_total{proxy="backend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Total requests / responses` with legend `Response errors` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 3 |
+
+Scenario: "Test Total frontend requests errors by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_frontend_request_errors_total{proxy="frontend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_frontend_request_errors_total{proxy="frontend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Total requests / responses` with legend `Requests errors` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 3 |
+
+Scenario: "Test Total backend redispatch warnings by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_backend_redispatch_warnings_total{proxy="backend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_backend_redispatch_warnings_total{proxy="backend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Total requests / responses` with legend `Backend redispatch` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 3 |
+
+Scenario: "Test Total backend retry warnings by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_backend_retry_warnings_total{proxy="backend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_backend_retry_warnings_total{proxy="backend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Total requests / responses` with legend `Backend retry` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 3 |
+
+Scenario: "Test Total frontend requests denied by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_frontend_requests_denied_total{proxy="frontend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_frontend_requests_denied_total{proxy="frontend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Total requests / responses` with legend `Request denied` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 3 |
+
+Scenario: "Test Total backend current queue by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_backend_current_queue{proxy="backend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_backend_current_queue{proxy="backend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Total requests / responses` with legend `Backend Queued` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 200 |
+
+Scenario: "Test Total frontend connections by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_frontend_connections_total{proxy="frontend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_frontend_connections_total{proxy="frontend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Total number of connections` with legend `Front` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 3 |
+
+Scenario: "Test Total backend connections attempts by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_backend_connection_attempts_total{proxy="backend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_backend_connection_attempts_total{proxy="backend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Total number of connections` with legend `Back` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 3 |
+
+Scenario: "Test Total backend connections error by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_backend_connection_errors_total{proxy="backend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_backend_connection_errors_total{proxy="backend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Total number of connections` with legend `Back errors` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 3 |
+
+Scenario: "Test Total frontend bytes incoming by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_frontend_bytes_in_total{proxy="frontend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_frontend_bytes_in_total{proxy="frontend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Current total of incoming / outgoing bytes` with legend `IN Front` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 24 |
+
+Scenario: "Test Total frontend bytes outgoing by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_frontend_bytes_out_total{proxy="frontend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_frontend_bytes_out_total{proxy="frontend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Current total of incoming / outgoing bytes` with legend `OUT Front` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 24 |
+
+Scenario: "Test Total backend bytes incoming by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_backend_bytes_in_total{proxy="backend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_backend_bytes_in_total{proxy="backend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Current total of incoming / outgoing bytes` with legend `IN Back` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 24 |
+
+Scenario: "Test Total backend bytes outgoing by instance"
+  Given the following series:
+    | metrics | values |
+    | haproxy_backend_bytes_out_total{proxy="backend",instance="ingress.rgw.1"} | 10 100 |
+    | haproxy_backend_bytes_out_total{proxy="backend",instance="ingress.rgw.1"} | 20 200 |
+  When variable `ingress_service` is `ingress.rgw.1`
+  Then Grafana panel `Current total of incoming / outgoing bytes` with legend `OUT Back` shows:
+    | metrics | values |
+    | {instance="ingress.rgw.1"} | 24 |
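
A note on the HAProxy expectations above: each scenario declares the same label set twice, so the second series effectively supersedes the first (the expected 3 matches the 20 200 samples alone). Assuming the framework's default one-minute sample step, which the RGW scenarios override explicitly with `When interval is 30s`, the counter panels reduce to a single irate:

    irate(counter[5m]) over the last two samples = (200 - 20) / 60 s = 3
    byte panels multiply by 8 for bits:            3 * 8             = 24
    haproxy_backend_current_queue is a gauge: the panel sums its last
    sample directly, hence 200.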