From: Arthur Outhenin-Chalandre Date: Mon, 16 May 2022 11:46:20 +0000 (+0200) Subject: ceph-mixin: refactor the structure of _config and utils X-Git-Tag: v16.2.11~103^2~116^2~5 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=8109f7da40ab2748cbd52b55f30712e1705f071f;p=ceph.git ceph-mixin: refactor the structure of _config and utils Before this refactor we couldn't override the config externally. Now the _config is correctly propagated and not only taken from the config.libsonnet file. Signed-off-by: Arthur Outhenin-Chalandre (cherry picked from commit fd4f484d220d98ba684878c87488cd74c502b4ff) --- diff --git a/monitoring/ceph-mixin/dashboards.libsonnet b/monitoring/ceph-mixin/dashboards.libsonnet new file mode 100644 index 0000000000000..5cae183294f98 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards.libsonnet @@ -0,0 +1,10 @@ +{ + grafanaDashboards+:: + (import 'dashboards/cephfs.libsonnet') + + (import 'dashboards/host.libsonnet') + + (import 'dashboards/osd.libsonnet') + + (import 'dashboards/pool.libsonnet') + + (import 'dashboards/rbd.libsonnet') + + (import 'dashboards/rgw.libsonnet') + + { _config:: $._config }, +} diff --git a/monitoring/ceph-mixin/dashboards/cephfs.libsonnet b/monitoring/ceph-mixin/dashboards/cephfs.libsonnet index ef7434d7f8d36..e18e295cd69bf 100644 --- a/monitoring/ceph-mixin/dashboards/cephfs.libsonnet +++ b/monitoring/ceph-mixin/dashboards/cephfs.libsonnet @@ -1,108 +1,104 @@ local g = import 'grafonnet/grafana.libsonnet'; -local u = import 'utils.libsonnet'; -local c = (import '../mixin.libsonnet')._config; -{ - grafanaDashboards+:: { - 'cephfs-overview.json': - local CephfsOverviewGraphPanel(title, formatY1, labelY1, expr, legendFormat, x, y, w, h) = - u.graphPanelSchema({}, - title, - '', - 'null', - false, - formatY1, - 'short', - labelY1, - null, - 0, - 1, - '$datasource') - .addTargets( - [u.addTargetSchema(expr, legendFormat)] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; +(import 'utils.libsonnet') { + 
'cephfs-overview.json': + local CephfsOverviewGraphPanel(title, formatY1, labelY1, expr, legendFormat, x, y, w, h) = + $.graphPanelSchema({}, + title, + '', + 'null', + false, + formatY1, + 'short', + labelY1, + null, + 0, + 1, + '$datasource') + .addTargets( + [$.addTargetSchema(expr, legendFormat)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; - u.dashboardSchema( - 'MDS Performance', - '', - 'tbO9LAiZz', - 'now-1h', - '15s', - 16, - c.dashboardTags, - '', - { - refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } + $.dashboardSchema( + 'MDS Performance', + '', + 'tbO9LAiZz', + 'now-1h', + '15s', + 16, + $._config.dashboardTags, + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' ) - .addAnnotation( - u.addAnnotationSchema( - 1, - '-- Grafana --', - true, - true, - 'rgba(0, 211, 255, 1)', - 'Annotations & Alerts', - 'dashboard' - ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addJobTemplate() + ) + .addTemplate( + $.addTemplateSchema('mds_servers', + '$datasource', + 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + true, + 1, + 'MDS Server', + '') + ) + .addPanels([ + $.addRowSchema(false, true, 'MDS Performance') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + CephfsOverviewGraphPanel( + 'MDS Workload - $mds_servers', + 'none', + 'Reads(-) / Writes 
(+)', + 'sum(rate(ceph_objecter_op_r{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % $.matchers(), + 'Read Ops', + 0, + 1, + 12, + 9 ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.2' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - u.addClusterTemplate() - ) - .addTemplate( - u.addJobTemplate() - ) - .addTemplate( - u.addTemplateSchema('mds_servers', - '$datasource', - 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % u.matchers(), - 1, - true, - 1, - 'MDS Server', - '') - ) - .addPanels([ - u.addRowSchema(false, true, 'MDS Performance') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, - CephfsOverviewGraphPanel( - 'MDS Workload - $mds_servers', - 'none', - 'Reads(-) / Writes (+)', - 'sum(rate(ceph_objecter_op_r{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % u.matchers(), - 'Read Ops', - 0, - 1, - 12, - 9 - ) - .addTarget(u.addTargetSchema( - 'sum(rate(ceph_objecter_op_w{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % u.matchers(), - 'Write Ops' - )) - .addSeriesOverride( - { alias: '/.*Reads/', transform: 'negative-Y' } - ), - CephfsOverviewGraphPanel( - 'Client Request Load - $mds_servers', - 'none', - 'Client Requests', - 'ceph_mds_server_handle_client_request{%(matchers)s, ceph_daemon=~"($mds_servers).*"}' % u.matchers(), - '{{ceph_daemon}}', - 12, - 1, - 12, - 9 - ), - ]), - }, + .addTarget($.addTargetSchema( + 'sum(rate(ceph_objecter_op_w{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % $.matchers(), + 'Write Ops' + )) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + CephfsOverviewGraphPanel( + 'Client Request Load - $mds_servers', + 'none', + 'Client Requests', + 'ceph_mds_server_handle_client_request{%(matchers)s, ceph_daemon=~"($mds_servers).*"}' % 
$.matchers(), + '{{ceph_daemon}}', + 12, + 1, + 12, + 9 + ), + ]), } diff --git a/monitoring/ceph-mixin/dashboards/dashboards.libsonnet b/monitoring/ceph-mixin/dashboards/dashboards.libsonnet deleted file mode 100644 index d40025044fa56..0000000000000 --- a/monitoring/ceph-mixin/dashboards/dashboards.libsonnet +++ /dev/null @@ -1,7 +0,0 @@ -(import '../config.libsonnet') + -(import 'cephfs.libsonnet') + -(import 'host.libsonnet') + -(import 'osd.libsonnet') + -(import 'pool.libsonnet') + -(import 'rbd.libsonnet') + -(import 'rgw.libsonnet') diff --git a/monitoring/ceph-mixin/dashboards/host.libsonnet b/monitoring/ceph-mixin/dashboards/host.libsonnet index a1b03b10ec8ba..d4724fc7d13f0 100644 --- a/monitoring/ceph-mixin/dashboards/host.libsonnet +++ b/monitoring/ceph-mixin/dashboards/host.libsonnet @@ -1,798 +1,794 @@ local g = import 'grafonnet/grafana.libsonnet'; -local u = import 'utils.libsonnet'; -local c = (import '../mixin.libsonnet')._config; -{ - grafanaDashboards+:: { - 'hosts-overview.json': - local HostsOverviewSingleStatPanel(format, - title, - description, - valueName, - expr, - instant, - x, - y, - w, - h) = - u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], - '$datasource', - format, - title, - description, - valueName, - false, - 100, - false, - false, - '') - .addTarget( - u.addTargetSchema(expr, '', 'time_series', 1, instant) - ) + { gridPos: { x: x, y: y, w: w, h: h } }; - - local HostsOverviewGraphPanel(title, description, formatY1, expr, legendFormat, x, y, w, h) = - u.graphPanelSchema( - {}, title, description, 'null', false, formatY1, 'short', null, null, 0, 1, '$datasource' - ) - .addTargets( - [u.addTargetSchema( - expr, legendFormat - )] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; - - u.dashboardSchema( - 'Host Overview', - '', - 'y0KGL0iZz', - 'now-1h', - '10s', - 16, - c.dashboardTags, - '', - { - refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', 
'1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.2' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addRequired( - type='panel', id='singlestat', name='Singlestat', version='5.0.0' - ) - .addAnnotation( - u.addAnnotationSchema( - 1, - '-- Grafana --', - true, - true, - 'rgba(0, 211, 255, 1)', - 'Annotations & Alerts', - 'dashboard' - ) - ) - .addTemplate( - g.template.datasource('datasource', - 'prometheus', - 'default', - label='Data Source') - ) - .addTemplate( - u.addClusterTemplate() - ) - .addTemplate( - u.addJobTemplate() - ) - .addTemplate( - u.addTemplateSchema('osd_hosts', - '$datasource', - 'label_values(ceph_disk_occupation{%(matchers)s}, exported_instance)' % u.matchers(), - 1, - true, - 1, - null, - '([^.]*).*') - ) - .addTemplate( - u.addTemplateSchema('mon_hosts', - '$datasource', - 'label_values(ceph_mon_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), - 1, - true, - 1, - null, - 'mon.(.*)') - ) - .addTemplate( - u.addTemplateSchema('mds_hosts', - '$datasource', - 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % u.matchers(), - 1, - true, - 1, - null, - 'mds.(.*)') - ) - .addTemplate( - u.addTemplateSchema('rgw_hosts', - '$datasource', - 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), - 1, - true, - 1, - null, - 'rgw.(.*)') - ) - .addPanels([ - HostsOverviewSingleStatPanel( - 'none', - 'OSD Hosts', - '', - 'current', - 'count(sum by (hostname) (ceph_osd_metadata{%(matchers)s}))' % u.matchers(), - true, - 0, - 0, - 4, - 5 - ), - HostsOverviewSingleStatPanel( - 'percentunit', - 'AVG CPU Busy', - 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster', - 'current', - ||| - avg(1 - ( - avg by(instance) ( - rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or - 
rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) - ) - )) - |||, - true, - 4, - 0, - 4, - 5 - ), - HostsOverviewSingleStatPanel( - 'percentunit', - 'AVG RAM Utilization', - 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)', - 'current', - ||| - avg (( - ( - node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or - node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} - ) - (( - node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or - node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + - ( - node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or - node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} - ) + ( - node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or - node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} - ) + ( - node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or - node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} - ) - ) - ) / ( - node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or - node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"} - )) - |||, - true, - 8, - 0, - 4, - 5 - ), - HostsOverviewSingleStatPanel( - 'none', - 'Physical IOPS', - 'IOPS Load at the device as reported by the OS on all OSD hosts', - 'current', - ||| - sum (( - rate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or - rate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval]) - ) + ( - rate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or - rate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval]) - )) - 
|||, - true, - 12, - 0, - 4, - 5 - ), - HostsOverviewSingleStatPanel( - 'percent', - 'AVG Disk Utilization', - 'Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)', - 'current', - ||| - avg ( - label_replace( - (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or - (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100), - "instance", "$1", "instance", "([^.:]*).*" - ) * on(instance, device) group_left(ceph_daemon) label_replace( - label_replace( - ceph_disk_occupation_human{%(matchers)s, instance=~"($osd_hosts).*"}, - "device", "$1", "device", "/dev/(.*)" - ), "instance", "$1", "instance", "([^.:]*).*" - ) - ) - ||| % u.matchers(), - true, - 16, - 0, - 4, - 5 - ), - HostsOverviewSingleStatPanel( - 'bytes', - 'Network Load', - 'Total send/receive network load across all hosts in the ceph cluster', - 'current', - ||| - sum ( - ( - rate(node_network_receive_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or - rate(node_network_receive_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) - ) unless on (device, instance) - label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") - ) + - sum ( - ( - rate(node_network_transmit_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or - rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) - ) unless on (device, instance) - label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") - ) - ||| - , - true, - 20, - 0, - 4, - 5 - ), - HostsOverviewGraphPanel( - 'CPU Busy - Top 10 Hosts', - 'Show the top 10 busiest hosts by cpu', - 'percent', - ||| - topk(10, - 100 * ( - 1 - ( - avg by(instance) ( - rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or - 
rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) - ) - ) - ) - ) - |||, - '{{instance}}', - 0, - 5, - 12, - 9 - ), - HostsOverviewGraphPanel( - 'Network Load - Top 10 Hosts', 'Top 10 hosts by network load', 'Bps', ||| - topk(10, (sum by(instance) ( - ( - rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or - rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) - ) + - ( - rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or - rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) - ) unless on (device, instance) - label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")) - )) - ||| - , '{{instance}}', 12, 5, 12, 9 - ), - ]), - 'host-details.json': - local HostDetailsSingleStatPanel(format, +(import 'utils.libsonnet') { + 'hosts-overview.json': + local HostsOverviewSingleStatPanel(format, title, description, valueName, expr, + instant, x, y, w, h) = - u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], - '$datasource', - format, - title, - description, - valueName, - false, - 100, - false, - false, - '') - .addTarget(u.addTargetSchema(expr)) + { gridPos: { x: x, y: y, w: w, h: h } }; + $.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + '$datasource', + format, + title, + description, + valueName, + false, + 100, + false, + false, + '') + .addTarget( + $.addTargetSchema(expr, '', 'time_series', 1, instant) + ) + { gridPos: { x: x, y: y, w: w, h: h } }; - local HostDetailsGraphPanel(alias, - title, - description, - nullPointMode, - formatY1, - labelY1, - expr, - legendFormat, - x, - y, - w, - h) = - u.graphPanelSchema(alias, - title, - description, - 
nullPointMode, - false, - formatY1, - 'short', - labelY1, - null, - null, - 1, - '$datasource') - .addTargets( - [u.addTargetSchema(expr, legendFormat)] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; + local HostsOverviewGraphPanel(title, description, formatY1, expr, legendFormat, x, y, w, h) = + $.graphPanelSchema( + {}, title, description, 'null', false, formatY1, 'short', null, null, 0, 1, '$datasource' + ) + .addTargets( + [$.addTargetSchema( + expr, legendFormat + )] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; - u.dashboardSchema( - 'Host Details', + $.dashboardSchema( + 'Host Overview', + '', + 'y0KGL0iZz', + 'now-1h', + '10s', + 16, + $._config.dashboardTags, + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addJobTemplate() + ) + .addTemplate( + $.addTemplateSchema('osd_hosts', + '$datasource', + 'label_values(ceph_disk_occupation{%(matchers)s}, exported_instance)' % $.matchers(), + 1, + true, + 1, + null, + '([^.]*).*') + ) + .addTemplate( + $.addTemplateSchema('mon_hosts', + '$datasource', + 'label_values(ceph_mon_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + true, + 1, + null, + 'mon.(.*)') + ) + .addTemplate( + $.addTemplateSchema('mds_hosts', + '$datasource', + 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % $.matchers(), 
+ 1, + true, + 1, + null, + 'mds.(.*)') + ) + .addTemplate( + $.addTemplateSchema('rgw_hosts', + '$datasource', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + true, + 1, + null, + 'rgw.(.*)') + ) + .addPanels([ + HostsOverviewSingleStatPanel( + 'none', + 'OSD Hosts', '', - 'rtOg0AiWz', - 'now-1h', - '10s', + 'current', + 'count(sum by (hostname) (ceph_osd_metadata{%(matchers)s}))' % $.matchers(), + true, + 0, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'percentunit', + 'AVG CPU Busy', + 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster', + 'current', + ||| + avg(1 - ( + avg by(instance) ( + rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or + rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) + ) + )) + |||, + true, + 4, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'percentunit', + 'AVG RAM Utilization', + 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)', + 'current', + ||| + avg (( + ( + node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} + ) - (( + node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + + ( + node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} + ) + ( + node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} + ) + ( + node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + 
node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} + ) + ) + ) / ( + node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or + node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"} + )) + |||, + true, + 8, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'none', + 'Physical IOPS', + 'IOPS Load at the device as reported by the OS on all OSD hosts', + 'current', + ||| + sum (( + rate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or + rate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval]) + ) + ( + rate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or + rate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval]) + )) + |||, + true, + 12, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'percent', + 'AVG Disk Utilization', + 'Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)', + 'current', + ||| + avg ( + label_replace( + (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or + (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100), + "instance", "$1", "instance", "([^.:]*).*" + ) * on(instance, device) group_left(ceph_daemon) label_replace( + label_replace( + ceph_disk_occupation_human{%(matchers)s, instance=~"($osd_hosts).*"}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^.:]*).*" + ) + ) + ||| % $.matchers(), + true, 16, - c.dashboardTags + ['overview'], - '', - { - refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.2' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addRequired( - type='panel', id='singlestat', name='Singlestat', version='5.0.0' - ) - 
.addAnnotation( - u.addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard' - ) - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - u.addClusterTemplate() - ) - .addTemplate( - u.addJobTemplate() - ) - .addTemplate( - u.addTemplateSchema('ceph_hosts', + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'bytes', + 'Network Load', + 'Total send/receive network load across all hosts in the ceph cluster', + 'current', + ||| + sum ( + ( + rate(node_network_receive_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or + rate(node_network_receive_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) + ) unless on (device, instance) + label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") + ) + + sum ( + ( + rate(node_network_transmit_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or + rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) + ) unless on (device, instance) + label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") + ) + ||| + , + true, + 20, + 0, + 4, + 5 + ), + HostsOverviewGraphPanel( + 'CPU Busy - Top 10 Hosts', + 'Show the top 10 busiest hosts by cpu', + 'percent', + ||| + topk(10, + 100 * ( + 1 - ( + avg by(instance) ( + rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or + rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) + ) + ) + ) + ) + |||, + '{{instance}}', + 0, + 5, + 12, + 9 + ), + HostsOverviewGraphPanel( + 'Network Load - Top 10 Hosts', 'Top 10 hosts by network load', 'Bps', ||| + topk(10, (sum by(instance) ( + ( + 
rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or + rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) + ) + + ( + rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or + rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) + ) unless on (device, instance) + label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")) + )) + ||| + , '{{instance}}', 12, 5, 12, 9 + ), + ]), + 'host-details.json': + local HostDetailsSingleStatPanel(format, + title, + description, + valueName, + expr, + x, + y, + w, + h) = + $.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], '$datasource', - 'label_values({%(clusterMatcher)s}, instance)' % u.matchers(), - 1, + format, + title, + description, + valueName, + false, + 100, false, - 3, - 'Hostname', - '([^.:]*).*') + false, + '') + .addTarget($.addTargetSchema(expr)) + { gridPos: { x: x, y: y, w: w, h: h } }; + + local HostDetailsGraphPanel(alias, + title, + description, + nullPointMode, + formatY1, + labelY1, + expr, + legendFormat, + x, + y, + w, + h) = + $.graphPanelSchema(alias, + title, + description, + nullPointMode, + false, + formatY1, + 'short', + labelY1, + null, + null, + 1, + '$datasource') + .addTargets( + [$.addTargetSchema(expr, legendFormat)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + $.dashboardSchema( + 'Host Details', + '', + 'rtOg0AiWz', + 'now-1h', + '10s', + 16, + $._config.dashboardTags + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + 
.addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard' ) - .addPanels([ - u.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, - HostDetailsSingleStatPanel( - 'none', - 'OSDs', - '', - 'current', - "count(sum by (ceph_daemon) (ceph_osd_metadata{%(matchers)s, hostname='$ceph_hosts'}))" % u.matchers(), - 0, - 1, - 3, - 5 - ), - HostDetailsGraphPanel( - { - interrupt: '#447EBC', - steal: '#6D1F62', - system: '#890F02', - user: '#3F6833', - wait: '#C15C17', - }, - 'CPU Utilization', - "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown", - 'null', - 'percent', - '% Utilization', - ||| - sum by (mode) ( - rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) or - rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) - ) / ( - scalar( - sum(rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or - rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) - ) * 100 - ) - |||, - '{{mode}}', - 3, - 1, - 6, - 10 - ), - HostDetailsGraphPanel( - { - Available: '#508642', - Free: '#508642', - Total: '#bf1b00', - Used: '#bf1b00', - total: '#bf1b00', - used: '#0a50a1', - }, - 'RAM Usage', - '', - 'null', - 'bytes', - 'RAM used', - ||| - node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or - node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} - |||, - 'Free', - 9, - 1, - 6, - 10 - ) - .addTargets( - [ - u.addTargetSchema( - ||| + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', 
label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addJobTemplate() + ) + .addTemplate( + $.addTemplateSchema('ceph_hosts', + '$datasource', + 'label_values({%(clusterMatcher)s}, instance)' % $.matchers(), + 1, + false, + 3, + 'Hostname', + '([^.:]*).*') + ) + .addPanels([ + $.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + HostDetailsSingleStatPanel( + 'none', + 'OSDs', + '', + 'current', + "count(sum by (ceph_daemon) (ceph_osd_metadata{%(matchers)s, hostname='$ceph_hosts'}))" % $.matchers(), + 0, + 1, + 3, + 5 + ), + HostDetailsGraphPanel( + { + interrupt: '#447EBC', + steal: '#6D1F62', + system: '#890F02', + user: '#3F6833', + wait: '#C15C17', + }, + 'CPU Utilization', + "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown", + 'null', + 'percent', + '% Utilization', + ||| + sum by (mode) ( + rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) or + rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) + ) / ( + scalar( + sum(rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) + ) * 100 + ) + |||, + '{{mode}}', + 3, + 1, + 6, + 10 + ), + HostDetailsGraphPanel( + { + Available: '#508642', + Free: '#508642', + Total: '#bf1b00', + Used: '#bf1b00', + total: '#bf1b00', + used: '#0a50a1', + }, + 'RAM Usage', + '', + 'null', + 'bytes', + 'RAM used', + ||| + node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + |||, + 'Free', + 9, + 1, + 6, + 10 + ) + .addTargets( + [ + $.addTargetSchema( + ||| + node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or + 
node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + |||, + 'total' + ), + $.addTargetSchema( + ||| + ( + node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) + ( + node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) + ( + node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) + |||, + 'buffers/cache' + ), + $.addTargetSchema( + ||| + ( node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or - node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} - |||, - 'total' - ), - u.addTargetSchema( - ||| + node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) - ( ( + node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or + node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} + ) + ( node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} ) + ( node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} - ) + ( + ) + + ( node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} ) - |||, - 'buffers/cache' - ), - u.addTargetSchema( - ||| - ( - node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or - node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} - ) - ( - ( - node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or - node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} - ) + ( - node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or - node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} - ) + ( - node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or - node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} - ) + - ( - 
node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or - node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} - ) - ) - |||, - 'used' - ), - ] - ) - .addSeriesOverride( - { - alias: 'total', - color: '#bf1b00', - fill: 0, - linewidth: 2, - stack: false, - } - ), - HostDetailsGraphPanel( - {}, - 'Network Load', - "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')", - 'null', - 'decbytes', - 'Send (-) / Receive (+)', - ||| - sum by (device) ( - rate( - node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or - rate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval] ) + |||, + 'used' + ), + ] + ) + .addSeriesOverride( + { + alias: 'total', + color: '#bf1b00', + fill: 0, + linewidth: 2, + stack: false, + } + ), + HostDetailsGraphPanel( + {}, + 'Network Load', + "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')", + 'null', + 'decbytes', + 'Send (-) / Receive (+)', + ||| + sum by (device) ( + rate( + node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or + rate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval] ) - |||, - '{{device}}.rx', - 15, - 1, - 6, - 10 - ) - .addTargets( - [ - u.addTargetSchema( - ||| - sum by (device) ( - rate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or - rate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) - ) - |||, - '{{device}}.tx' - ), - ] - ) - .addSeriesOverride( - { alias: '/.*tx/', transform: 'negative-Y' } - ), - HostDetailsGraphPanel( - {}, - 'Network drop rate', - '', - 'null', - 'pps', - 'Send (-) / Receive (+)', - ||| - rate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or - 
rate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) - |||, - '{{device}}.rx', - 21, - 1, - 3, - 5 - ) - .addTargets( - [ - u.addTargetSchema( - ||| - rate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or - rate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) - |||, - '{{device}}.tx' - ), - ] - ) - .addSeriesOverride( - { - alias: '/.*tx/', - transform: 'negative-Y', - } - ), - HostDetailsSingleStatPanel( - 'bytes', - 'Raw Capacity', - 'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.', - 'current', - ||| - sum( - ceph_osd_stat_bytes{%(matchers)s} and - on (ceph_daemon) ceph_disk_occupation{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"} - ) - ||| % u.matchers(), - 0, - 6, - 3, - 5 - ), - HostDetailsGraphPanel( - {}, - 'Network error rate', - '', - 'null', - 'pps', - 'Send (-) / Receive (+)', - ||| - rate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or - rate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) - |||, - '{{device}}.rx', - 21, - 6, - 3, - 5 - ) - .addTargets( - [u.addTargetSchema( + ) + |||, + '{{device}}.rx', + 15, + 1, + 6, + 10 + ) + .addTargets( + [ + $.addTargetSchema( + ||| + sum by (device) ( + rate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or + rate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) + ) + |||, + '{{device}}.tx' + ), + ] + ) + .addSeriesOverride( + { alias: '/.*tx/', transform: 'negative-Y' } + ), + HostDetailsGraphPanel( + {}, + 'Network drop rate', + '', + 'null', + 'pps', + 'Send (-) / Receive (+)', + ||| + rate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + 
rate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) + |||, + '{{device}}.rx', + 21, + 1, + 3, + 5 + ) + .addTargets( + [ + $.addTargetSchema( ||| - rate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or - rate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) + rate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) |||, '{{device}}.tx' - )] - ) - .addSeriesOverride( - { - alias: '/.*tx/', - transform: 'negative-Y', - } - ), - u.addRowSchema(false, - true, - 'OSD Disk Performance Statistics') + { gridPos: { x: 0, y: 11, w: 24, h: 1 } }, - HostDetailsGraphPanel( - {}, - '$ceph_hosts Disk IOPS', - "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it's name and corresponding OSD id value", - 'connected', - 'ops', - 'Read (-) / Write (+)', + ), + ] + ) + .addSeriesOverride( + { + alias: '/.*tx/', + transform: 'negative-Y', + } + ), + HostDetailsSingleStatPanel( + 'bytes', + 'Raw Capacity', + 'Each OSD consists of a Journal/WAL partition and a data partition. 
The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.', + 'current', + ||| + sum( + ceph_osd_stat_bytes{%(matchers)s} and + on (ceph_daemon) ceph_disk_occupation{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"} + ) + ||| % $.matchers(), + 0, + 6, + 3, + 5 + ), + HostDetailsGraphPanel( + {}, + 'Network error rate', + '', + 'null', + 'pps', + 'Send (-) / Receive (+)', + ||| + rate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) + |||, + '{{device}}.rx', + 21, + 6, + 3, + 5 + ) + .addTargets( + [$.addTargetSchema( ||| + rate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) + |||, + '{{device}}.tx' + )] + ) + .addSeriesOverride( + { + alias: '/.*tx/', + transform: 'negative-Y', + } + ), + $.addRowSchema(false, + true, + 'OSD Disk Performance Statistics') + { gridPos: { x: 0, y: 11, w: 24, h: 1 } }, + HostDetailsGraphPanel( + {}, + '$ceph_hosts Disk IOPS', + "For any OSD devices on the host, this chart shows the iops per physical device. 
Each device is shown by it's name and corresponding OSD id value", + 'connected', + 'ops', + 'Read (-) / Write (+)', + ||| + label_replace( + ( + rate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) + ), "instance", "$1", "instance", "([^:.]*).*" + ) * on(instance, device) group_left(ceph_daemon) label_replace( label_replace( - ( - rate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or - rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) - ), "instance", "$1", "instance", "([^:.]*).*" - ) * on(instance, device) group_left(ceph_daemon) label_replace( + ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}}({{ceph_daemon}}) writes', + 0, + 12, + 11, + 9 + ) + .addTargets( + [ + $.addTargetSchema( + ||| label_replace( - ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)" - ), "instance", "$1", "instance", "([^:.]*).*" - ) - ||| % u.matchers(), - '{{device}}({{ceph_daemon}}) writes', - 0, - 12, - 11, - 9 - ) - .addTargets( - [ - u.addTargetSchema( - ||| + ( + rate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) + ), "instance", "$1", "instance", "([^:.]*).*" + ) * on(instance, device) group_left(ceph_daemon) label_replace( label_replace( - ( - rate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or - rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) - ), "instance", "$1", "instance", "([^:.]*).*" - ) * on(instance, device) group_left(ceph_daemon) label_replace( - label_replace( - 
ceph_disk_occupation_human{%(matchers)s},"device", "$1", "device", "/dev/(.*)" - ), "instance", "$1", "instance", "([^:.]*).*" - ) - ||| % u.matchers(), - '{{device}}({{ceph_daemon}}) reads' - ), - ] - ) - .addSeriesOverride( - { alias: '/.*reads/', transform: 'negative-Y' } - ), - HostDetailsGraphPanel( - {}, - '$ceph_hosts Throughput by Disk', - 'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id', - 'connected', - 'Bps', - 'Read (-) / Write (+)', + ceph_disk_occupation_human{%(matchers)s},"device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}}({{ceph_daemon}}) reads' + ), + ] + ) + .addSeriesOverride( + { alias: '/.*reads/', transform: 'negative-Y' } + ), + HostDetailsGraphPanel( + {}, + '$ceph_hosts Throughput by Disk', + 'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. 
Each device is shown by device name, and corresponding OSD id', + 'connected', + 'Bps', + 'Read (-) / Write (+)', + ||| + label_replace( + ( + rate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) + ), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) + group_left(ceph_daemon) label_replace( + label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"), + "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}}({{ceph_daemon}}) write', + 12, + 12, + 11, + 9 + ) + .addTargets( + [$.addTargetSchema( ||| label_replace( ( - rate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or - rate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) - ), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) + rate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) + ), + "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace( label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*" ) - ||| % u.matchers(), - '{{device}}({{ceph_daemon}}) write', - 12, - 12, - 11, - 9 - ) - .addTargets( - [u.addTargetSchema( - ||| - label_replace( - ( - rate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or - rate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) - ), - "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) - group_left(ceph_daemon) label_replace( - label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"), - "instance", "$1", 
"instance", "([^:.]*).*" - ) - ||| % u.matchers(), - '{{device}}({{ceph_daemon}}) read' - )] - ) - .addSeriesOverride( - { alias: '/.*read/', transform: 'negative-Y' } - ), - HostDetailsGraphPanel( - {}, - '$ceph_hosts Disk Latency', - "For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with it's corresponding OSD id", - 'null as zero', - 's', - '', - ||| - max by(instance, device) (label_replace( - (rate(node_disk_write_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) / - clamp_min(rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001) or - (rate(node_disk_read_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) / - clamp_min(rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001), - "instance", "$1", "instance", "([^:.]*).*" - )) * on(instance, device) group_left(ceph_daemon) label_replace( - label_replace( - ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, - "device", "$1", "device", "/dev/(.*)" - ), "instance", "$1", "instance", "([^:.]*).*" - ) - ||| % u.matchers(), - '{{device}}({{ceph_daemon}})', - 0, - 21, - 11, - 9 - ), - HostDetailsGraphPanel( - {}, - '$ceph_hosts Disk utilization', - 'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.', - 'connected', - 'percent', - '%Util', - ||| + ||| % $.matchers(), + '{{device}}({{ceph_daemon}}) read' + )] + ) + .addSeriesOverride( + { alias: '/.*read/', transform: 'negative-Y' } + ), + HostDetailsGraphPanel( + {}, + '$ceph_hosts Disk Latency', + "For OSD hosts, this chart shows the latency at the physical drive. 
Each drive is shown by device name, with it's corresponding OSD id", + 'null as zero', + 's', + '', + ||| + max by(instance, device) (label_replace( + (rate(node_disk_write_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) / + clamp_min(rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001) or + (rate(node_disk_read_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) / + clamp_min(rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001), + "instance", "$1", "instance", "([^:.]*).*" + )) * on(instance, device) group_left(ceph_daemon) label_replace( label_replace( - ( - (rate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) / 10) or - rate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) * 100 - ), "instance", "$1", "instance", "([^:.]*).*" - ) * on(instance, device) group_left(ceph_daemon) label_replace( - label_replace(ceph_disk_occupation_human{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"}, - "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*" - ) - ||| % u.matchers(), - '{{device}}({{ceph_daemon}})', - 12, - 21, - 11, - 9 - ), - ]), - }, + ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}}({{ceph_daemon}})', + 0, + 21, + 11, + 9 + ), + HostDetailsGraphPanel( + {}, + '$ceph_hosts Disk utilization', + 'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.', + 'connected', + 'percent', + '%Util', + ||| + label_replace( + ( + (rate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) / 10) or + 
rate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) * 100 + ), "instance", "$1", "instance", "([^:.]*).*" + ) * on(instance, device) group_left(ceph_daemon) label_replace( + label_replace(ceph_disk_occupation_human{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"}, + "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}}({{ceph_daemon}})', + 12, + 21, + 11, + 9 + ), + ]), } diff --git a/monitoring/ceph-mixin/dashboards/osd.libsonnet b/monitoring/ceph-mixin/dashboards/osd.libsonnet index b08efa3637f03..959cc51e45c58 100644 --- a/monitoring/ceph-mixin/dashboards/osd.libsonnet +++ b/monitoring/ceph-mixin/dashboards/osd.libsonnet @@ -1,587 +1,544 @@ local g = import 'grafonnet/grafana.libsonnet'; -local u = import 'utils.libsonnet'; -local c = (import '../mixin.libsonnet')._config; -{ - grafanaDashboards+:: { - 'osds-overview.json': - local OsdOverviewStyle(alias, pattern, type, unit) = - u.addStyle(alias, null, [ - 'rgba(245, 54, 54, 0.9)', - 'rgba(237, 129, 40, 0.89)', - 'rgba(50, 172, 45, 0.97)', - ], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []); - local OsdOverviewGraphPanel(alias, - title, - description, - formatY1, - labelY1, - min, - expr, - legendFormat1, - x, - y, - w, - h) = - u.graphPanelSchema(alias, - title, - description, - 'null', - false, - formatY1, - 'short', - labelY1, - null, - min, - 1, - '$datasource') - .addTargets( - [u.addTargetSchema(expr, legendFormat1)] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; - local OsdOverviewPieChartPanel(alias, description, title) = - u.addPieChartSchema(alias, - '$datasource', - description, - 'Under graph', - 'pie', - title, - 'current'); - local OsdOverviewSingleStatPanel(colors, - format, - title, - description, - valueName, - colorValue, - gaugeMaxValue, - gaugeShow, - sparkLineShow, - thresholds, - expr, - x, - y, - w, - h) = - u.addSingleStatSchema( - colors, - '$datasource', - 
format, - title, - description, - valueName, - colorValue, - gaugeMaxValue, - gaugeShow, - sparkLineShow, - thresholds - ) - .addTarget( - u.addTargetSchema(expr) - ) + { gridPos: { x: x, y: y, w: w, h: h } }; - - u.dashboardSchema( - 'OSD Overview', - '', - 'lo02I1Aiz', - 'now-1h', - '10s', - 16, - c.dashboardTags, - '', - { - refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } - ) - .addAnnotation( - u.addAnnotationSchema( - 1, - '-- Grafana --', - true, - true, - 'rgba(0, 211, 255, 1)', - 'Annotations & Alerts', - 'dashboard' - ) - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.0.0' +(import 'utils.libsonnet') { + 'osds-overview.json': + local OsdOverviewStyle(alias, pattern, type, unit) = + $.addStyle(alias, null, [ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 0.97)', + ], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []); + local OsdOverviewGraphPanel(alias, + title, + description, + formatY1, + labelY1, + min, + expr, + legendFormat1, + x, + y, + w, + h) = + $.graphPanelSchema(alias, + title, + description, + 'null', + false, + formatY1, + 'short', + labelY1, + null, + min, + 1, + '$datasource') + .addTargets( + [$.addTargetSchema(expr, legendFormat1)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + local OsdOverviewPieChartPanel(alias, description, title) = + $.addPieChartSchema(alias, + '$datasource', + description, + 'Under graph', + 'pie', + title, + 'current'); + local OsdOverviewSingleStatPanel(colors, + format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparkLineShow, + thresholds, + expr, + x, + y, + w, + h) = + $.addSingleStatSchema( + colors, + '$datasource', + format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparkLineShow, + thresholds ) - .addRequired( - type='panel', 
id='grafana-piechart-panel', name='Pie Chart', version='1.3.3' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addRequired( - type='panel', id='table', name='Table', version='5.0.0' - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - u.addClusterTemplate() + .addTarget( + $.addTargetSchema(expr) + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + $.dashboardSchema( + 'OSD Overview', + '', + 'lo02I1Aiz', + 'now-1h', + '10s', + 16, + $._config.dashboardTags, + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' ) - .addTemplate( - u.addJobTemplate() + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='table', name='Table', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addJobTemplate() + ) + .addPanels([ + OsdOverviewGraphPanel( + { '@95%ile': '#e0752d' }, + 'OSD Read Latencies', + '', + 'ms', + null, + '0', + ||| + avg ( + rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000 + ) + ||| % $.matchers(), + 'AVG read', + 0, + 0, + 8, + 8 ) - .addPanels([ - OsdOverviewGraphPanel( - { '@95%ile': '#e0752d' }, - 'OSD Read Latencies', - '', - 'ms', - null, - '0', - ||| - avg ( - 
rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / + .addTargets( + [ + $.addTargetSchema( + ||| + max( + rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000 - ) - ||| % u.matchers(), - 'AVG read', - 0, - 0, - 8, - 8 - ) - .addTargets( - [ - u.addTargetSchema( - ||| - max( + ) + ||| % $.matchers(), + 'MAX read' + ), + $.addTargetSchema( + ||| + quantile(0.95, + ( rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / - on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000 + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) + * 1000 ) - ||| % u.matchers(), - 'MAX read' - ), - u.addTargetSchema( - ||| - quantile(0.95, - ( - rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / - on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) - * 1000 - ) - ) - ||| % u.matchers(), - '@95%ile' - ), - ], - ), - u.addTableSchema( - '$datasource', - "This table shows the osd's that are delivering the 10 highest read latencies within the cluster", - { col: 2, desc: true }, - [ - OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'), - OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'), - OsdOverviewStyle('', '/.*/', 'hidden', 'short'), - ], - 'Highest READ Latencies', - 'table' - ) - .addTarget( - u.addTargetSchema( - ||| - topk(10, - (sort( - ( - rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / - on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * - 1000 - ) - )) ) - ||| % u.matchers(), - '', - 'table', - 1, - true - ) - ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } }, - OsdOverviewGraphPanel( - { - '@95%ile write': '#e0752d', - }, - 'OSD Write Latencies', - '', - 'ms', - null, - '0', + ||| % $.matchers(), + '@95%ile' + ), + ], + ), + $.addTableSchema( + '$datasource', + "This table 
shows the osd's that are delivering the 10 highest read latencies within the cluster", + { col: 2, desc: true }, + [ + OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'), + OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'), + OsdOverviewStyle('', '/.*/', 'hidden', 'short'), + ], + 'Highest READ Latencies', + 'table' + ) + .addTarget( + $.addTargetSchema( ||| - avg( - rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / - on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) - * 1000 - ) - ||| % u.matchers(), - 'AVG write', - 12, - 0, - 8, - 8 - ) - .addTargets( - [ - u.addTargetSchema( - ||| - max( - rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / - on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * + topk(10, + (sort( + ( + rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000 ) - ||| % u.matchers(), 'MAX write' - ), - u.addTargetSchema( - ||| - quantile(0.95, ( - rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / - on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * - 1000 - )) - ||| % u.matchers(), '@95%ile write' - ), - ], - ), - u.addTableSchema( - '$datasource', - "This table shows the osd's that are delivering the 10 highest write latencies within the cluster", - { col: 2, desc: true }, - [ - OsdOverviewStyle( - 'OSD ID', 'ceph_daemon', 'string', 'short' - ), - OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'), - OsdOverviewStyle('', '/.*/', 'hidden', 'short'), - ], - 'Highest WRITE Latencies', - 'table' + )) + ) + ||| % $.matchers(), + '', + 'table', + 1, + true ) - .addTarget( - u.addTargetSchema( + ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } }, + OsdOverviewGraphPanel( + { + '@95%ile write': '#e0752d', + }, + 'OSD Write Latencies', + '', + 'ms', + null, + '0', + ||| + avg( + 
rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) + * 1000 + ) + ||| % $.matchers(), + 'AVG write', + 12, + 0, + 8, + 8 + ) + .addTargets( + [ + $.addTargetSchema( ||| - topk(10, - (sort( - (rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / - on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * - 1000) - )) + max( + rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * + 1000 ) - ||| % u.matchers(), - '', - 'table', - 1, - true - ) - ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } }, - OsdOverviewPieChartPanel( - {}, '', 'OSD Types Summary' - ) - .addTarget( - u.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % u.matchers(), '{{device_class}}') - ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } }, - OsdOverviewPieChartPanel( - { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types' - ) - .addTarget( - u.addTargetSchema( - 'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % u.matchers(), 'bluestore', 'time_series', 2 - ) - ) - .addTarget( - u.addTargetSchema( - 'absent(ceph_bluefs_wal_total_bytes%(matchers)s) * count(ceph_osd_metadata{%(matchers)s})' % u.matchers(), 'filestore', 'time_series', 2 - ) - ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } }, - OsdOverviewPieChartPanel( - {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary' - ) - .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % u.matchers(), '<1TB', 'time_series', 2 - )) - .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % u.matchers(), '<2TB', 'time_series', 2 - )) - .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % u.matchers(), '<3TB', 
'time_series', 2 - )) - .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % u.matchers(), '<4TB', 'time_series', 2 - )) - .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % u.matchers(), '<6TB', 'time_series', 2 - )) - .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % u.matchers(), '<8TB', 'time_series', 2 - )) - .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % u.matchers(), '<10TB', 'time_series', 2 - )) - .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % u.matchers(), '<12TB', 'time_series', 2 - )) - .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % u.matchers(), '<12TB+', 'time_series', 2 - )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } }, - g.graphPanel.new(bars=true, - datasource='$datasource', - title='Distribution of PGs per OSD', - x_axis_buckets=20, - x_axis_mode='histogram', - x_axis_values=['total'], - formatY1='short', - formatY2='short', - labelY1='# of OSDs', - min='0', - nullPointMode='null') - .addTarget(u.addTargetSchema( - 'ceph_osd_numpg{%(matchers)s}' % u.matchers(), 'PGs per OSD', 'time_series', 1, true - )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } }, - OsdOverviewSingleStatPanel( - ['#d44a3a', '#299c46'], - 'percentunit', - 'OSD onode Hits Ratio', - 'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster', - 'current', - true, - 1, - true, - false, - '.75', + ||| % $.matchers(), 'MAX write' + ), + $.addTargetSchema( + ||| + quantile(0.95, ( + rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * + 1000 + )) + ||| % $.matchers(), 
'@95%ile write' + ), + ], + ), + $.addTableSchema( + '$datasource', + "This table shows the osd's that are delivering the 10 highest write latencies within the cluster", + { col: 2, desc: true }, + [ + OsdOverviewStyle( + 'OSD ID', 'ceph_daemon', 'string', 'short' + ), + OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'), + OsdOverviewStyle('', '/.*/', 'hidden', 'short'), + ], + 'Highest WRITE Latencies', + 'table' + ) + .addTarget( + $.addTargetSchema( ||| - sum(ceph_bluestore_onode_hits{%(matchers)s}) / ( - sum(ceph_bluestore_onode_hits{%(matchers)s}) + - sum(ceph_bluestore_onode_misses{%(matchers)s}) + topk(10, + (sort( + (rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * + 1000) + )) ) - ||| % u.matchers(), - 20, - 8, - 4, - 8 - ), - u.addRowSchema(false, - true, - 'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } }, - OsdOverviewGraphPanel( - {}, - 'Read/Write Profile', - 'Show the read/write workload profile overtime', - 'short', - null, - null, - 'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % u.matchers(), - 'Reads', - 0, - 17, - 24, - 8 + ||| % $.matchers(), + '', + 'table', + 1, + true ) - .addTargets([u.addTargetSchema( - 'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % u.matchers(), 'Writes' - )]), - ]), - 'osd-device-details.json': - local OsdDeviceDetailsPanel(title, - description, - formatY1, - labelY1, - expr1, - expr2, - legendFormat1, - legendFormat2, - x, - y, - w, - h) = - u.graphPanelSchema({}, - title, - description, - 'null', - false, - formatY1, - 'short', - labelY1, - null, - null, - 1, - '$datasource') - .addTargets( - [ - u.addTargetSchema(expr1, - legendFormat1), - u.addTargetSchema(expr2, legendFormat2), - ] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; - - u.dashboardSchema( - 'OSD device details', - '', - 'CrAHE0iZz', - 'now-3h', - '', - 16, - c.dashboardTags, - '', - { - 
refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } + ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } }, + OsdOverviewPieChartPanel( + {}, '', 'OSD Types Summary' ) - .addAnnotation( - u.addAnnotationSchema( - 1, - '-- Grafana --', - true, - true, - 'rgba(0, 211, 255, 1)', - 'Annotations & Alerts', - 'dashboard' + .addTarget( + $.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % $.matchers(), '{{device_class}}') + ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } }, + OsdOverviewPieChartPanel( + { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types' + ) + .addTarget( + $.addTargetSchema( + 'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % $.matchers(), 'bluestore', 'time_series', 2 ) ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.2' + .addTarget( + $.addTargetSchema( + 'absent(ceph_bluefs_wal_total_bytes%(matchers)s) * count(ceph_osd_metadata{%(matchers)s})' % $.matchers(), 'filestore', 'time_series', 2 + ) + ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } }, + OsdOverviewPieChartPanel( + {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary' ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % $.matchers(), '<1TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % $.matchers(), '<2TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % $.matchers(), '<3TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % $.matchers(), '<4TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 
'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % $.matchers(), '<6TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % $.matchers(), '<8TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % $.matchers(), '<10TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % $.matchers(), '<12TB', 'time_series', 2 + )) + .addTarget($.addTargetSchema( + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % $.matchers(), '<12TB+', 'time_series', 2 + )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } }, + g.graphPanel.new(bars=true, + datasource='$datasource', + title='Distribution of PGs per OSD', + x_axis_buckets=20, + x_axis_mode='histogram', + x_axis_values=['total'], + formatY1='short', + formatY2='short', + labelY1='# of OSDs', + min='0', + nullPointMode='null') + .addTarget($.addTargetSchema( + 'ceph_osd_numpg{%(matchers)s}' % $.matchers(), 'PGs per OSD', 'time_series', 1, true + )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } }, + OsdOverviewSingleStatPanel( + ['#d44a3a', '#299c46'], + 'percentunit', + 'OSD onode Hits Ratio', + 'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster', + 'current', + true, + 1, + true, + false, + '.75', + ||| + sum(ceph_bluestore_onode_hits{%(matchers)s}) / ( + sum(ceph_bluestore_onode_hits{%(matchers)s}) + + sum(ceph_bluestore_onode_misses{%(matchers)s}) + ) + ||| % $.matchers(), + 20, + 8, + 4, + 8 + ), + $.addRowSchema(false, + true, + 'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } }, + OsdOverviewGraphPanel( + {}, + 'Read/Write Profile', + 'Show the read/write workload profile overtime', + 'short', + null, + null, + 
'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % $.matchers(), + 'Reads', + 0, + 17, + 24, + 8 ) - .addTemplate( - g.template.datasource('datasource', - 'prometheus', - 'default', - label='Data Source') + .addTargets([$.addTargetSchema( + 'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes' + )]), + ]), + 'osd-device-details.json': + local OsdDeviceDetailsPanel(title, + description, + formatY1, + labelY1, + expr1, + expr2, + legendFormat1, + legendFormat2, + x, + y, + w, + h) = + $.graphPanelSchema({}, + title, + description, + 'null', + false, + formatY1, + 'short', + labelY1, + null, + null, + 1, + '$datasource') + .addTargets( + [ + $.addTargetSchema(expr1, + legendFormat1), + $.addTargetSchema(expr2, legendFormat2), + ] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + $.dashboardSchema( + 'OSD device details', + '', + 'CrAHE0iZz', + 'now-3h', + '', + 16, + $._config.dashboardTags, + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' ) - .addTemplate( - u.addClusterTemplate() + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addJobTemplate() + ) + .addTemplate( + $.addTemplateSchema('osd', + '$datasource', + 'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + false, + 1, + 'OSD', + '(.*)') + ) + .addPanels([ + $.addRowSchema( + false, true, 'OSD Performance' + ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + 
OsdDeviceDetailsPanel( + '$osd Latency', + '', + 's', + 'Read (-) / Write (+)', + ||| + rate(ceph_osd_op_r_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) + ||| % $.matchers(), + ||| + rate(ceph_osd_op_w_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) + ||| % $.matchers(), + 'read', + 'write', + 0, + 1, + 6, + 9 ) - .addTemplate( - u.addJobTemplate() + .addSeriesOverride( + { + alias: 'read', + transform: 'negative-Y', + } + ), + OsdDeviceDetailsPanel( + '$osd R/W IOPS', + '', + 'short', + 'Read (-) / Write (+)', + 'rate(ceph_osd_op_r{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(), + 'rate(ceph_osd_op_w{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(), + 'Reads', + 'Writes', + 6, + 1, + 6, + 9 ) - .addTemplate( - u.addTemplateSchema('osd', - '$datasource', - 'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), - 1, - false, - 1, - 'OSD', - '(.*)') + .addSeriesOverride( + { alias: 'Reads', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + '$osd R/W Bytes', + '', + 'bytes', + 'Read (-) / Write (+)', + 'rate(ceph_osd_op_r_out_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(), + 'rate(ceph_osd_op_w_in_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(), + 'Read Bytes', + 'Write Bytes', + 12, + 1, + 6, + 9 ) - .addPanels([ - u.addRowSchema( - false, true, 'OSD Performance' - ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, - OsdDeviceDetailsPanel( - '$osd Latency', - '', - 's', - 'Read (-) / Write (+)', - ||| - rate(ceph_osd_op_r_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) / - on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) - ||| % u.matchers(), - ||| - 
rate(ceph_osd_op_w_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) / - on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) - ||| % u.matchers(), - 'read', - 'write', - 0, - 1, - 6, - 9 - ) - .addSeriesOverride( - { - alias: 'read', - transform: 'negative-Y', - } - ), - OsdDeviceDetailsPanel( - '$osd R/W IOPS', - '', - 'short', - 'Read (-) / Write (+)', - 'rate(ceph_osd_op_r{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(), - 'rate(ceph_osd_op_w{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(), - 'Reads', - 'Writes', - 6, - 1, - 6, - 9 - ) - .addSeriesOverride( - { alias: 'Reads', transform: 'negative-Y' } - ), - OsdDeviceDetailsPanel( - '$osd R/W Bytes', - '', - 'bytes', - 'Read (-) / Write (+)', - 'rate(ceph_osd_op_r_out_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(), - 'rate(ceph_osd_op_w_in_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(), - 'Read Bytes', - 'Write Bytes', - 12, - 1, - 6, - 9 - ) - .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }), - u.addRowSchema( - false, true, 'Physical Device Performance' - ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } }, - OsdDeviceDetailsPanel( - 'Physical Device Latency for $osd', - '', - 's', - 'Read (-) / Write (+)', - ||| - ( - label_replace( - rate(node_disk_read_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) / - rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]), - "instance", "$1", "instance", "([^:.]*).*" - ) and on (instance, device) label_replace( - label_replace( - ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, - "device", "$1", "device", "/dev/(.*)" - ), "instance", "$1", "instance", "([^:.]*).*" - ) - ) - ||| % u.matchers(), - ||| - ( - label_replace( - rate(node_disk_write_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) / - 
rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]), - "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) - label_replace( - label_replace( - ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" - ), "instance", "$1", "instance", "([^:.]*).*" - ) - ) - ||| % u.matchers(), - '{{instance}}/{{device}} Reads', - '{{instance}}/{{device}} Writes', - 0, - 11, - 6, - 9 - ) - .addSeriesOverride( - { alias: '/.*Reads/', transform: 'negative-Y' } - ), - OsdDeviceDetailsPanel( - 'Physical Device R/W IOPS for $osd', - '', - 'short', - 'Read (-) / Write (+)', - ||| + .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }), + $.addRowSchema( + false, true, 'Physical Device Performance' + ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } }, + OsdDeviceDetailsPanel( + 'Physical Device Latency for $osd', + '', + 's', + 'Read (-) / Write (+)', + ||| + ( label_replace( - rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]), + rate(node_disk_read_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) / + rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*" ) and on (instance, device) label_replace( label_replace( @@ -589,90 +546,129 @@ local c = (import '../mixin.libsonnet')._config; "device", "$1", "device", "/dev/(.*)" ), "instance", "$1", "instance", "([^:.]*).*" ) - ||| % u.matchers(), - ||| + ) + ||| % $.matchers(), + ||| + ( label_replace( - rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]), - "instance", "$1", "instance", "([^:.]*).*" - ) and on (instance, device) label_replace( + rate(node_disk_write_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) / + rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]), + "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace( - 
ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, - "device", "$1", "device", "/dev/(.*)" - ), "instance", "$1", "instance", "([^:.]*).*" + label_replace( + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) ) - ||| % u.matchers(), - '{{device}} on {{instance}} Writes', - '{{device}} on {{instance}} Reads', - 6, - 11, - 6, - 9 - ) - .addSeriesOverride( - { alias: '/.*Reads/', transform: 'negative-Y' } - ), - OsdDeviceDetailsPanel( - 'Physical Device R/W Bytes for $osd', - '', - 'Bps', - 'Read (-) / Write (+)', - ||| + ||| % $.matchers(), + '{{instance}}/{{device}} Reads', + '{{instance}}/{{device}} Writes', + 0, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + 'Physical Device R/W IOPS for $osd', + '', + 'short', + 'Read (-) / Write (+)', + ||| + label_replace( + rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]), + "instance", "$1", "instance", "([^:.]*).*" + ) and on (instance, device) label_replace( label_replace( - rate(node_disk_read_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*" - ) and on (instance, device) label_replace( - label_replace( - ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, - "device", "$1", "device", "/dev/(.*)" - ), "instance", "$1", "instance", "([^:.]*).*" - ) - ||| % u.matchers(), - ||| + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + ||| + label_replace( + rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]), + "instance", "$1", "instance", "([^:.]*).*" + ) and on (instance, device) label_replace( label_replace( - rate(node_disk_written_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", 
"instance", "([^:.]*).*" - ) and on (instance, device) label_replace( - label_replace( - ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, - "device", "$1", "device", "/dev/(.*)" - ), "instance", "$1", "instance", "([^:.]*).*" - ) - ||| % u.matchers(), - '{{instance}} {{device}} Reads', - '{{instance}} {{device}} Writes', - 12, - 11, - 6, - 9 - ) - .addSeriesOverride( - { alias: '/.*Reads/', transform: 'negative-Y' } - ), - u.graphPanelSchema( - {}, - 'Physical Device Util% for $osd', - '', - 'null', - false, - 'percentunit', - 'short', - null, - null, - null, - 1, - '$datasource' - ) - .addTarget(u.addTargetSchema( - ||| + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}} on {{instance}} Writes', + '{{device}} on {{instance}} Reads', + 6, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + 'Physical Device R/W Bytes for $osd', + '', + 'Bps', + 'Read (-) / Write (+)', + ||| + label_replace( + rate(node_disk_read_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*" + ) and on (instance, device) label_replace( label_replace( - rate(node_disk_io_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]), - "instance", "$1", "instance", "([^:.]*).*" - ) and on (instance, device) label_replace( - label_replace( - ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" - ), "instance", "$1", "instance", "([^:.]*).*" - ) - ||| % u.matchers(), - '{{device}} on {{instance}}' - )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } }, - ]), - }, + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + ||| + label_replace( + 
rate(node_disk_written_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*" + ) and on (instance, device) label_replace( + label_replace( + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, + "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{instance}} {{device}} Reads', + '{{instance}} {{device}} Writes', + 12, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + $.graphPanelSchema( + {}, + 'Physical Device Util% for $osd', + '', + 'null', + false, + 'percentunit', + 'short', + null, + null, + null, + 1, + '$datasource' + ) + .addTarget($.addTargetSchema( + ||| + label_replace( + rate(node_disk_io_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]), + "instance", "$1", "instance", "([^:.]*).*" + ) and on (instance, device) label_replace( + label_replace( + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" + ), "instance", "$1", "instance", "([^:.]*).*" + ) + ||| % $.matchers(), + '{{device}} on {{instance}}' + )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } }, + ]), } diff --git a/monitoring/ceph-mixin/dashboards/pool.libsonnet b/monitoring/ceph-mixin/dashboards/pool.libsonnet index 400a07e19e672..b87588250412e 100644 --- a/monitoring/ceph-mixin/dashboards/pool.libsonnet +++ b/monitoring/ceph-mixin/dashboards/pool.libsonnet @@ -1,467 +1,55 @@ local g = import 'grafonnet/grafana.libsonnet'; -local u = import 'utils.libsonnet'; -local c = (import '../mixin.libsonnet')._config; -{ - grafanaDashboards+:: { - 'pool-overview.json': - local PoolOverviewSingleStatPanel(format, - title, - description, - valueName, - expr, - instant, - targetFormat, - x, - y, - w, - h) = - u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], - '$datasource', - format, - title, - description, - valueName, - false, - 100, - false, - false, - '') - 
.addTarget(u.addTargetSchema(expr, '', targetFormat, 1, instant)) + { gridPos: { x: x, y: y, w: w, h: h } }; - - local PoolOverviewStyle(alias, - pattern, - type, - unit, - colorMode, - thresholds, - valueMaps) = - u.addStyle(alias, - colorMode, - [ - 'rgba(245, 54, 54, 0.9)', - 'rgba(237, 129, 40, 0.89)', - 'rgba(50, 172, 45, 0.97)', - ], - 'YYYY-MM-DD HH:mm:ss', - 2, - 1, - pattern, - thresholds, - type, - unit, - valueMaps); - - local PoolOverviewGraphPanel(title, - description, - formatY1, - labelY1, - expr, - legendFormat, - x, - y, - w, - h) = - u.graphPanelSchema({}, - title, - description, - 'null as zero', - false, - formatY1, - 'short', - labelY1, - null, - 0, - 1, - '$datasource') - .addTargets( - [u.addTargetSchema(expr, - legendFormat)] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; - - u.dashboardSchema( - 'Ceph Pools Overview', - '', - 'z99hzWtmk', - 'now-1h', - '15s', - 22, - c.dashboardTags, - '', - { - refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } - ) - .addAnnotation( - u.addAnnotationSchema( - 1, - '-- Grafana --', - true, - true, - 'rgba(0, 211, 255, 1)', - 'Annotations & Alerts', - 'dashboard' - ) - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - u.addClusterTemplate() - ) - .addTemplate( - u.addJobTemplate() - ) - .addTemplate( - g.template.custom(label='TopK', - name='topk', - current='15', - query='15') - ) - .addPanels([ - PoolOverviewSingleStatPanel( - 'none', - 'Pools', - '', - 'avg', - 'count(ceph_pool_metadata{%(matchers)s})' % u.matchers(), - true, - 'table', - 0, - 0, - 3, - 3 - ), - PoolOverviewSingleStatPanel( - 'none', - 'Pools with Compression', - 'Count of the pools that have compression enabled', - 'current', - 'count(ceph_pool_metadata{%(matchers)s, compression_mode!="none"})' % u.matchers(), - null, - '', - 3, - 0, - 3, - 3 - 
), - PoolOverviewSingleStatPanel( - 'bytes', - 'Total Raw Capacity', - 'Total raw capacity available to the cluster', - 'current', - 'sum(ceph_osd_stat_bytes{%(matchers)s})' % u.matchers(), - null, - '', - 6, - 0, - 3, - 3 - ), - PoolOverviewSingleStatPanel( - 'bytes', - 'Raw Capacity Consumed', - 'Total raw capacity consumed by user data and associated overheads (metadata + redundancy)', - 'current', - 'sum(ceph_pool_bytes_used{%(matchers)s})' % u.matchers(), - true, - '', - 9, - 0, - 3, - 3 - ), - PoolOverviewSingleStatPanel( - 'bytes', - 'Logical Stored ', - 'Total of client data stored in the cluster', - 'current', - 'sum(ceph_pool_stored{%(matchers)s})' % u.matchers(), - true, - '', - 12, - 0, - 3, - 3 - ), - PoolOverviewSingleStatPanel( - 'bytes', - 'Compression Savings', - 'A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression', - 'current', - ||| - sum( - ceph_pool_compress_under_bytes{%(matchers)s} - - ceph_pool_compress_bytes_used{%(matchers)s} - ) - ||| % u.matchers(), - null, - '', - 15, - 0, - 3, - 3 - ), - PoolOverviewSingleStatPanel( - 'percent', - 'Compression Eligibility', - 'Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data', - 'current', - ||| - ( - sum(ceph_pool_compress_under_bytes{%(matchers)s} > 0) / - sum(ceph_pool_stored_raw{%(matchers)s} and ceph_pool_compress_under_bytes{%(matchers)s} > 0) - ) * 100 - ||| % u.matchers(), - null, - 'table', - 18, - 0, - 3, - 3 - ), - PoolOverviewSingleStatPanel( - 'none', - 'Compression Factor', - 'This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. 
It does not account for data written that was ineligible for compression (too small, or compression yield too low)', - 'current', - ||| - sum( - ceph_pool_compress_under_bytes{%(matchers)s} > 0) - / sum(ceph_pool_compress_bytes_used{%(matchers)s} > 0 - ) - ||| % u.matchers(), - null, - '', - 21, - 0, - 3, - 3 - ), - u.addTableSchema( - '$datasource', - '', - { col: 5, desc: true }, - [ - PoolOverviewStyle('', 'Time', 'hidden', 'short', null, [], []), - PoolOverviewStyle('', 'instance', 'hidden', 'short', null, [], []), - PoolOverviewStyle('', 'job', 'hidden', 'short', null, [], []), - PoolOverviewStyle('Pool Name', 'name', 'string', 'short', null, [], []), - PoolOverviewStyle('Pool ID', 'pool_id', 'hidden', 'none', null, [], []), - PoolOverviewStyle('Compression Factor', 'Value #A', 'number', 'none', null, [], []), - PoolOverviewStyle('% Used', 'Value #D', 'number', 'percentunit', 'value', ['70', '85'], []), - PoolOverviewStyle('Usable Free', 'Value #B', 'number', 'bytes', null, [], []), - PoolOverviewStyle('Compression Eligibility', 'Value #C', 'number', 'percent', null, [], []), - PoolOverviewStyle('Compression Savings', 'Value #E', 'number', 'bytes', null, [], []), - PoolOverviewStyle('Growth (5d)', 'Value #F', 'number', 'bytes', 'value', ['0', '0'], []), - PoolOverviewStyle('IOPS', 'Value #G', 'number', 'none', null, [], []), - PoolOverviewStyle('Bandwidth', 'Value #H', 'number', 'Bps', null, [], []), - PoolOverviewStyle('', '__name__', 'hidden', 'short', null, [], []), - PoolOverviewStyle('', 'type', 'hidden', 'short', null, [], []), - PoolOverviewStyle('', 'compression_mode', 'hidden', 'short', null, [], []), - PoolOverviewStyle('Type', 'description', 'string', 'short', null, [], []), - PoolOverviewStyle('Stored', 'Value #J', 'number', 'bytes', null, [], []), - PoolOverviewStyle('', 'Value #I', 'hidden', 'short', null, [], []), - PoolOverviewStyle('Compression', 'Value #K', 'string', 'short', null, [], [{ text: 'ON', value: '1' }]), - ], - 'Pool Overview', - 
'table' - ) - .addTargets( - [ - u.addTargetSchema( - ||| - ( - ceph_pool_compress_under_bytes{%(matchers)s} / - ceph_pool_compress_bytes_used{%(matchers)s} > 0 - ) and on(pool_id) ( - ( - (ceph_pool_compress_under_bytes{%(matchers)s} > 0) / - ceph_pool_stored_raw{%(matchers)s} - ) * 100 > 0.5 - ) - ||| % u.matchers(), - 'A', - 'table', - 1, - true - ), - u.addTargetSchema( - ||| - ceph_pool_max_avail{%(matchers)s} * - on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s} - ||| % u.matchers(), - 'B', - 'table', - 1, - true - ), - u.addTargetSchema( - ||| - ( - (ceph_pool_compress_under_bytes{%(matchers)s} > 0) / - ceph_pool_stored_raw{%(matchers)s} - ) * 100 - ||| % u.matchers(), - 'C', - 'table', - 1, - true - ), - u.addTargetSchema( - ||| - ceph_pool_percent_used{%(matchers)s} * - on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s} - ||| % u.matchers(), - 'D', - 'table', - 1, - true - ), - u.addTargetSchema( - ||| - ceph_pool_compress_under_bytes{%(matchers)s} - - ceph_pool_compress_bytes_used{%(matchers)s} > 0 - ||| % u.matchers(), - 'E', - 'table', - 1, - true - ), - u.addTargetSchema( - 'delta(ceph_pool_stored{%(matchers)s}[5d])' % u.matchers(), 'F', 'table', 1, true - ), - u.addTargetSchema( - ||| - rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) - + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) - ||| % u.matchers(), - 'G', - 'table', - 1, - true - ), - u.addTargetSchema( - ||| - rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + - rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) - ||| % u.matchers(), - 'H', - 'table', - 1, - true - ), - u.addTargetSchema( - 'ceph_pool_metadata{%(matchers)s}' % u.matchers(), 'I', 'table', 1, true - ), - u.addTargetSchema( - 'ceph_pool_stored{%(matchers)s} * on(pool_id) group_left ceph_pool_metadata{%(matchers)s}' % u.matchers(), - 'J', - 'table', - 1, - true - ), - u.addTargetSchema( - 'ceph_pool_metadata{%(matchers)s, compression_mode!="none"}' % u.matchers(), 'K', 'table', 1, true - 
), - u.addTargetSchema('', 'L', '', '', null), - ] - ) + { gridPos: { x: 0, y: 3, w: 24, h: 6 } }, - PoolOverviewGraphPanel( - 'Top $topk Client IOPS by Pool', - 'This chart shows the sum of read and write IOPS from all clients by pool', - 'short', - 'IOPS', - ||| - topk($topk, - round( - ( - rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) + - rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) - ), 1 - ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s}) - ||| % u.matchers(), - '{{name}} ', - 0, - 9, - 12, - 8 - ) - .addTarget( - u.addTargetSchema( - ||| - topk($topk, - rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) + - on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s} - ) - ||| % u.matchers(), - '{{name}} - write' - ) - ), - PoolOverviewGraphPanel( - 'Top $topk Client Bandwidth by Pool', - 'The chart shows the sum of read and write bytes from all clients, by pool', - 'Bps', - 'Throughput', - ||| - topk($topk, - ( - rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + - rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) - ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s} - ) - ||| % u.matchers(), - '{{name}}', - 12, - 9, - 12, - 8 - ), - PoolOverviewGraphPanel( - 'Pool Capacity Usage (RAW)', - 'Historical view of capacity usage, to help identify growth and trends in pool consumption', - 'bytes', - 'Capacity Used', - 'ceph_pool_bytes_used{%(matchers)s} * on(pool_id) group_right ceph_pool_metadata{%(matchers)s}' % u.matchers(), - '{{name}}', - 0, - 17, - 24, - 7 - ), - ]), - 'pool-detail.json': - local PoolDetailSingleStatPanel(format, +(import 'utils.libsonnet') { + 'pool-overview.json': + local PoolOverviewSingleStatPanel(format, title, description, valueName, - colorValue, - gaugeMaxValue, - gaugeShow, - sparkLineShow, - thresholds, expr, + instant, targetFormat, x, y, w, h) = - u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], - '$datasource', - 
format, - title, - description, - valueName, - colorValue, - gaugeMaxValue, - gaugeShow, - sparkLineShow, - thresholds) - .addTarget(u.addTargetSchema(expr, '', targetFormat)) + { gridPos: { x: x, y: y, w: w, h: h } }; + $.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + '$datasource', + format, + title, + description, + valueName, + false, + 100, + false, + false, + '') + .addTarget($.addTargetSchema(expr, '', targetFormat, 1, instant)) + { gridPos: { x: x, y: y, w: w, h: h } }; - local PoolDetailGraphPanel(alias, - title, + local PoolOverviewStyle(alias, + pattern, + type, + unit, + colorMode, + thresholds, + valueMaps) = + $.addStyle(alias, + colorMode, + [ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 0.97)', + ], + 'YYYY-MM-DD HH:mm:ss', + 2, + 1, + pattern, + thresholds, + type, + unit, + valueMaps); + + local PoolOverviewGraphPanel(title, description, formatY1, labelY1, @@ -471,213 +59,621 @@ local c = (import '../mixin.libsonnet')._config; y, w, h) = - u.graphPanelSchema(alias, - title, - description, - 'null as zero', - false, - formatY1, - 'short', - labelY1, - null, - null, - 1, - '$datasource') - .addTargets( - [u.addTargetSchema(expr, legendFormat)] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; + $.graphPanelSchema({}, + title, + description, + 'null as zero', + false, + formatY1, + 'short', + labelY1, + null, + 0, + 1, + '$datasource') + .addTargets( + [$.addTargetSchema(expr, + legendFormat)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; - u.dashboardSchema( - 'Ceph Pool Details', + $.dashboardSchema( + 'Ceph Pools Overview', + '', + 'z99hzWtmk', + 'now-1h', + '15s', + 22, + $._config.dashboardTags, + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', 
+ 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addJobTemplate() + ) + .addTemplate( + g.template.custom(label='TopK', + name='topk', + current='15', + query='15') + ) + .addPanels([ + PoolOverviewSingleStatPanel( + 'none', + 'Pools', '', - '-xyV8KCiz', - 'now-1h', - '15s', - 22, - c.dashboardTags, + 'avg', + 'count(ceph_pool_metadata{%(matchers)s})' % $.matchers(), + true, + 'table', + 0, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'none', + 'Pools with Compression', + 'Count of the pools that have compression enabled', + 'current', + 'count(ceph_pool_metadata{%(matchers)s, compression_mode!="none"})' % $.matchers(), + null, '', - { - refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.2' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addRequired( - type='panel', id='singlestat', name='Singlestat', version='5.0.0' - ) - .addAnnotation( - u.addAnnotationSchema( - 1, - '-- Grafana --', - true, - true, - 'rgba(0, 211, 255, 1)', - 'Annotations & Alerts', - 'dashboard' - ) - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - u.addClusterTemplate() - ) - .addTemplate( - u.addJobTemplate() + 3, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Total Raw Capacity', + 'Total raw capacity available to the cluster', + 'current', + 'sum(ceph_osd_stat_bytes{%(matchers)s})' % $.matchers(), + null, + '', + 6, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Raw Capacity Consumed', + 'Total raw capacity consumed by user data and associated overheads (metadata + 
redundancy)', + 'current', + 'sum(ceph_pool_bytes_used{%(matchers)s})' % $.matchers(), + true, + '', + 9, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Logical Stored ', + 'Total of client data stored in the cluster', + 'current', + 'sum(ceph_pool_stored{%(matchers)s})' % $.matchers(), + true, + '', + 12, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Compression Savings', + 'A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression', + 'current', + ||| + sum( + ceph_pool_compress_under_bytes{%(matchers)s} - + ceph_pool_compress_bytes_used{%(matchers)s} + ) + ||| % $.matchers(), + null, + '', + 15, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'percent', + 'Compression Eligibility', + 'Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data', + 'current', + ||| + ( + sum(ceph_pool_compress_under_bytes{%(matchers)s} > 0) / + sum(ceph_pool_stored_raw{%(matchers)s} and ceph_pool_compress_under_bytes{%(matchers)s} > 0) + ) * 100 + ||| % $.matchers(), + null, + 'table', + 18, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'none', + 'Compression Factor', + 'This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. 
It does not account for data written that was ineligible for compression (too small, or compression yield too low)', + 'current', + ||| + sum( + ceph_pool_compress_under_bytes{%(matchers)s} > 0) + / sum(ceph_pool_compress_bytes_used{%(matchers)s} > 0 + ) + ||| % $.matchers(), + null, + '', + 21, + 0, + 3, + 3 + ), + $.addTableSchema( + '$datasource', + '', + { col: 5, desc: true }, + [ + PoolOverviewStyle('', 'Time', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'instance', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'job', 'hidden', 'short', null, [], []), + PoolOverviewStyle('Pool Name', 'name', 'string', 'short', null, [], []), + PoolOverviewStyle('Pool ID', 'pool_id', 'hidden', 'none', null, [], []), + PoolOverviewStyle('Compression Factor', 'Value #A', 'number', 'none', null, [], []), + PoolOverviewStyle('% Used', 'Value #D', 'number', 'percentunit', 'value', ['70', '85'], []), + PoolOverviewStyle('Usable Free', 'Value #B', 'number', 'bytes', null, [], []), + PoolOverviewStyle('Compression Eligibility', 'Value #C', 'number', 'percent', null, [], []), + PoolOverviewStyle('Compression Savings', 'Value #E', 'number', 'bytes', null, [], []), + PoolOverviewStyle('Growth (5d)', 'Value #F', 'number', 'bytes', 'value', ['0', '0'], []), + PoolOverviewStyle('IOPS', 'Value #G', 'number', 'none', null, [], []), + PoolOverviewStyle('Bandwidth', 'Value #H', 'number', 'Bps', null, [], []), + PoolOverviewStyle('', '__name__', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'type', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'compression_mode', 'hidden', 'short', null, [], []), + PoolOverviewStyle('Type', 'description', 'string', 'short', null, [], []), + PoolOverviewStyle('Stored', 'Value #J', 'number', 'bytes', null, [], []), + PoolOverviewStyle('', 'Value #I', 'hidden', 'short', null, [], []), + PoolOverviewStyle('Compression', 'Value #K', 'string', 'short', null, [], [{ text: 'ON', value: '1' }]), + ], + 'Pool Overview', + 
'table' ) - .addTemplate( - u.addTemplateSchema('pool_name', - '$datasource', - 'label_values(ceph_pool_metadata{%(matchers)s}, name)' % u.matchers(), - 1, - false, - 1, - 'Pool Name', - '') + .addTargets( + [ + $.addTargetSchema( + ||| + ( + ceph_pool_compress_under_bytes{%(matchers)s} / + ceph_pool_compress_bytes_used{%(matchers)s} > 0 + ) and on(pool_id) ( + ( + (ceph_pool_compress_under_bytes{%(matchers)s} > 0) / + ceph_pool_stored_raw{%(matchers)s} + ) * 100 > 0.5 + ) + ||| % $.matchers(), + 'A', + 'table', + 1, + true + ), + $.addTargetSchema( + ||| + ceph_pool_max_avail{%(matchers)s} * + on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s} + ||| % $.matchers(), + 'B', + 'table', + 1, + true + ), + $.addTargetSchema( + ||| + ( + (ceph_pool_compress_under_bytes{%(matchers)s} > 0) / + ceph_pool_stored_raw{%(matchers)s} + ) * 100 + ||| % $.matchers(), + 'C', + 'table', + 1, + true + ), + $.addTargetSchema( + ||| + ceph_pool_percent_used{%(matchers)s} * + on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s} + ||| % $.matchers(), + 'D', + 'table', + 1, + true + ), + $.addTargetSchema( + ||| + ceph_pool_compress_under_bytes{%(matchers)s} - + ceph_pool_compress_bytes_used{%(matchers)s} > 0 + ||| % $.matchers(), + 'E', + 'table', + 1, + true + ), + $.addTargetSchema( + 'delta(ceph_pool_stored{%(matchers)s}[5d])' % $.matchers(), 'F', 'table', 1, true + ), + $.addTargetSchema( + ||| + rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) + ||| % $.matchers(), + 'G', + 'table', + 1, + true + ), + $.addTargetSchema( + ||| + rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) + ||| % $.matchers(), + 'H', + 'table', + 1, + true + ), + $.addTargetSchema( + 'ceph_pool_metadata{%(matchers)s}' % $.matchers(), 'I', 'table', 1, true + ), + $.addTargetSchema( + 'ceph_pool_stored{%(matchers)s} * on(pool_id) group_left 
ceph_pool_metadata{%(matchers)s}' % $.matchers(), + 'J', + 'table', + 1, + true + ), + $.addTargetSchema( + 'ceph_pool_metadata{%(matchers)s, compression_mode!="none"}' % $.matchers(), 'K', 'table', 1, true + ), + $.addTargetSchema('', 'L', '', '', null), + ] + ) + { gridPos: { x: 0, y: 3, w: 24, h: 6 } }, + PoolOverviewGraphPanel( + 'Top $topk Client IOPS by Pool', + 'This chart shows the sum of read and write IOPS from all clients by pool', + 'short', + 'IOPS', + ||| + topk($topk, + round( + ( + rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) + ), 1 + ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s}) + ||| % $.matchers(), + '{{name}} ', + 0, + 9, + 12, + 8 ) - .addPanels([ - PoolDetailSingleStatPanel( - 'percentunit', - 'Capacity used', - '', - 'current', - true, - 1, - true, - true, - '.7,.8', - ||| - (ceph_pool_stored{%(matchers)s} / (ceph_pool_stored{%(matchers)s} + ceph_pool_max_avail{%(matchers)s})) * - on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} - ||| % u.matchers(), - 'time_series', - 0, - 0, - 7, - 7 - ), - PoolDetailSingleStatPanel( - 's', - 'Time till full', - 'Time till pool is full assuming the average fill rate of the last 4 hours', - false, - 100, - false, - false, - '', - 'current', + .addTarget( + $.addTargetSchema( ||| - (ceph_pool_max_avail{%(matchers)s} / deriv(ceph_pool_stored{%(matchers)s}[6h])) * - on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} > 0 - ||| % u.matchers(), - 'time_series', - 7, - 0, - 5, - 7 - ), - PoolDetailGraphPanel( - { - read_op_per_sec: - '#3F6833', - write_op_per_sec: '#E5AC0E', - }, - '$pool_name Object Ingress/Egress', - '', - 'ops', - 'Objects out(-) / in(+) ', - ||| - deriv(ceph_pool_objects{%(matchers)s}[1m]) * - on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} - ||| % u.matchers(), - 'Objects per 
second', - 12, - 0, - 12, - 7 - ), - PoolDetailGraphPanel( - { - read_op_per_sec: '#3F6833', - write_op_per_sec: '#E5AC0E', - }, - '$pool_name Client IOPS', - '', - 'iops', - 'Read (-) / Write (+)', - ||| - rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) * - on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} - ||| % u.matchers(), - 'reads', - 0, - 7, - 12, - 7 + topk($topk, + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) + + on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s} + ) + ||| % $.matchers(), + '{{name}} - write' ) - .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' }) - .addTarget( - u.addTargetSchema( - ||| - rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) * - on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} - ||| % u.matchers(), - 'writes' + ), + PoolOverviewGraphPanel( + 'Top $topk Client Bandwidth by Pool', + 'The chart shows the sum of read and write bytes from all clients, by pool', + 'Bps', + 'Throughput', + ||| + topk($topk, + ( + rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) + ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s} ) - ), - PoolDetailGraphPanel( - { - read_op_per_sec: '#3F6833', - write_op_per_sec: '#E5AC0E', - }, - '$pool_name Client Throughput', - '', - 'Bps', - 'Read (-) / Write (+)', + ||| % $.matchers(), + '{{name}}', + 12, + 9, + 12, + 8 + ), + PoolOverviewGraphPanel( + 'Pool Capacity Usage (RAW)', + 'Historical view of capacity usage, to help identify growth and trends in pool consumption', + 'bytes', + 'Capacity Used', + 'ceph_pool_bytes_used{%(matchers)s} * on(pool_id) group_right ceph_pool_metadata{%(matchers)s}' % $.matchers(), + '{{name}}', + 0, + 17, + 24, + 7 + ), + ]), + 'pool-detail.json': + local PoolDetailSingleStatPanel(format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + 
gaugeShow, + sparkLineShow, + thresholds, + expr, + targetFormat, + x, + y, + w, + h) = + $.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + '$datasource', + format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparkLineShow, + thresholds) + .addTarget($.addTargetSchema(expr, '', targetFormat)) + { gridPos: { x: x, y: y, w: w, h: h } }; + + local PoolDetailGraphPanel(alias, + title, + description, + formatY1, + labelY1, + expr, + legendFormat, + x, + y, + w, + h) = + $.graphPanelSchema(alias, + title, + description, + 'null as zero', + false, + formatY1, + 'short', + labelY1, + null, + null, + 1, + '$datasource') + .addTargets( + [$.addTargetSchema(expr, legendFormat)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + $.dashboardSchema( + 'Ceph Pool Details', + '', + '-xyV8KCiz', + 'now-1h', + '15s', + 22, + $._config.dashboardTags, + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addJobTemplate() + ) + .addTemplate( + $.addTemplateSchema('pool_name', + '$datasource', + 'label_values(ceph_pool_metadata{%(matchers)s}, name)' % $.matchers(), + 1, + false, + 1, + 'Pool Name', + '') + ) + .addPanels([ + PoolDetailSingleStatPanel( + 'percentunit', + 'Capacity used', + '', + 'current', + true, + 1, + true, 
+ true, + '.7,.8', + ||| + (ceph_pool_stored{%(matchers)s} / (ceph_pool_stored{%(matchers)s} + ceph_pool_max_avail{%(matchers)s})) * + on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % $.matchers(), + 'time_series', + 0, + 0, + 7, + 7 + ), + PoolDetailSingleStatPanel( + 's', + 'Time till full', + 'Time till pool is full assuming the average fill rate of the last 4 hours', + false, + 100, + false, + false, + '', + 'current', + ||| + (ceph_pool_max_avail{%(matchers)s} / deriv(ceph_pool_stored{%(matchers)s}[6h])) * + on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} > 0 + ||| % $.matchers(), + 'time_series', + 7, + 0, + 5, + 7 + ), + PoolDetailGraphPanel( + { + read_op_per_sec: + '#3F6833', + write_op_per_sec: '#E5AC0E', + }, + '$pool_name Object Ingress/Egress', + '', + 'ops', + 'Objects out(-) / in(+) ', + ||| + deriv(ceph_pool_objects{%(matchers)s}[1m]) * + on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % $.matchers(), + 'Objects per second', + 12, + 0, + 12, + 7 + ), + PoolDetailGraphPanel( + { + read_op_per_sec: '#3F6833', + write_op_per_sec: '#E5AC0E', + }, + '$pool_name Client IOPS', + '', + 'iops', + 'Read (-) / Write (+)', + ||| + rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) * + on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % $.matchers(), + 'reads', + 0, + 7, + 12, + 7 + ) + .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' }) + .addTarget( + $.addTargetSchema( ||| - rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) * on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} - ||| % u.matchers(), - 'reads', - 12, - 7, - 12, - 7 + ||| % $.matchers(), + 'writes' ) - .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' }) - .addTarget( - 
u.addTargetSchema( - ||| - rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) + - on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} - ||| % u.matchers(), - 'writes' - ) - ), - PoolDetailGraphPanel( - { - read_op_per_sec: '#3F6833', - write_op_per_sec: '#E5AC0E', - }, - '$pool_name Objects', - '', - 'short', - 'Objects', + ), + PoolDetailGraphPanel( + { + read_op_per_sec: '#3F6833', + write_op_per_sec: '#E5AC0E', + }, + '$pool_name Client Throughput', + '', + 'Bps', + 'Read (-) / Write (+)', + ||| + rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + + on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % $.matchers(), + 'reads', + 12, + 7, + 12, + 7 + ) + .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' }) + .addTarget( + $.addTargetSchema( ||| - ceph_pool_objects{%(matchers)s} * + rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) + on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} - ||| % u.matchers(), - 'Number of Objects', - 0, - 14, - 12, - 7 - ), - ]), - }, + ||| % $.matchers(), + 'writes' + ) + ), + PoolDetailGraphPanel( + { + read_op_per_sec: '#3F6833', + write_op_per_sec: '#E5AC0E', + }, + '$pool_name Objects', + '', + 'short', + 'Objects', + ||| + ceph_pool_objects{%(matchers)s} * + on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % $.matchers(), + 'Number of Objects', + 0, + 14, + 12, + 7 + ), + ]), } diff --git a/monitoring/ceph-mixin/dashboards/rbd.libsonnet b/monitoring/ceph-mixin/dashboards/rbd.libsonnet index ba3db60e53813..0e273ef3d6e5d 100644 --- a/monitoring/ceph-mixin/dashboards/rbd.libsonnet +++ b/monitoring/ceph-mixin/dashboards/rbd.libsonnet @@ -2,359 +2,357 @@ local g = import 'grafonnet/grafana.libsonnet'; local u = import 'utils.libsonnet'; local c = (import '../mixin.libsonnet')._config; -{ - grafanaDashboards+:: { - 
'rbd-details.json': - local RbdDetailsPanel(title, formatY1, expr1, expr2, x, y, w, h) = - u.graphPanelSchema({}, - title, - '', - 'null as zero', - false, - formatY1, - formatY1, - null, - null, - 0, - 1, - '$datasource') - .addTargets( - [ - u.addTargetSchema(expr1, - '{{pool}} Write'), - u.addTargetSchema(expr2, '{{pool}} Read'), - ] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; +(import 'utils.libsonnet') { + 'rbd-details.json': + local RbdDetailsPanel(title, formatY1, expr1, expr2, x, y, w, h) = + $.graphPanelSchema({}, + title, + '', + 'null as zero', + false, + formatY1, + formatY1, + null, + null, + 0, + 1, + '$datasource') + .addTargets( + [ + $.addTargetSchema(expr1, + '{{pool}} Write'), + $.addTargetSchema(expr2, '{{pool}} Read'), + ] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; - u.dashboardSchema( - 'RBD Details', - 'Detailed Performance of RBD Images (IOPS/Throughput/Latency)', - 'YhCYGcuZz', - 'now-1h', - false, - 16, - c.dashboardTags, - '', - { - refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } - ) - .addAnnotation( - u.addAnnotationSchema( - 1, - '-- Grafana --', - true, - true, - 'rgba(0, 211, 255, 1)', - 'Annotations & Alerts', - 'dashboard' - ) - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.3' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - u.addClusterTemplate() - ) - .addTemplate( - u.addJobTemplate() - ) - .addTemplate( - u.addTemplateSchema('pool', - '$datasource', - 'label_values(pool)', - 1, - false, - 0, - '', - '') - ) - .addTemplate( - u.addTemplateSchema('image', - '$datasource', - 'label_values(image)', - 1, - false, - 0, - '', - '') + $.dashboardSchema( + 'RBD Details', + 'Detailed Performance of RBD Images 
(IOPS/Throughput/Latency)', + 'YhCYGcuZz', + 'now-1h', + false, + 16, + $._config.dashboardTags, + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' ) - .addPanels([ - RbdDetailsPanel( - 'IOPS', - 'iops', - 'rate(ceph_rbd_write_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers() - , - 'rate(ceph_rbd_read_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers(), - 0, - 0, - 8, - 9 - ), - RbdDetailsPanel( - 'Throughput', - 'Bps', - 'rate(ceph_rbd_write_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers(), - 'rate(ceph_rbd_read_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers(), - 8, - 0, - 8, - 9 - ), - RbdDetailsPanel( - 'Average Latency', - 'ns', - ||| - rate(ceph_rbd_write_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) / - rate(ceph_rbd_write_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) - ||| % u.matchers(), - ||| - rate(ceph_rbd_read_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) / - rate(ceph_rbd_read_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) - ||| % u.matchers(), - 16, - 0, - 8, - 9 - ), - ]), - 'rbd-overview.json': - local RgwOverviewStyle(alias, pattern, type, unit) = - u.addStyle(alias, - null, - ['rgba(245, 54, 54, 0.9)', 'rgba(237, 129, 40, 0.89)', 'rgba(50, 172, 45, 0.97)'], - 'YYYY-MM-DD HH:mm:ss', - 2, - 1, - pattern, - [], - type, - unit, - []); - local RbdOverviewPanel(title, - formatY1, - expr1, - expr2, - legendFormat1, - legendFormat2, - x, - y, - w, - h) = - u.graphPanelSchema({}, - title, - '', - 'null', - false, + ) + .addRequired( 
+ type='grafana', id='grafana', name='Grafana', version='5.3.3' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addJobTemplate() + ) + .addTemplate( + $.addTemplateSchema('pool', + '$datasource', + 'label_values(pool)', + 1, + false, + 0, + '', + '') + ) + .addTemplate( + $.addTemplateSchema('image', + '$datasource', + 'label_values(image)', + 1, + false, + 0, + '', + '') + ) + .addPanels([ + RbdDetailsPanel( + 'IOPS', + 'iops', + 'rate(ceph_rbd_write_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers() + , + 'rate(ceph_rbd_read_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers(), + 0, + 0, + 8, + 9 + ), + RbdDetailsPanel( + 'Throughput', + 'Bps', + 'rate(ceph_rbd_write_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers(), + 'rate(ceph_rbd_read_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers(), + 8, + 0, + 8, + 9 + ), + RbdDetailsPanel( + 'Average Latency', + 'ns', + ||| + rate(ceph_rbd_write_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) / + rate(ceph_rbd_write_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) + ||| % $.matchers(), + ||| + rate(ceph_rbd_read_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) / + rate(ceph_rbd_read_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) + ||| % $.matchers(), + 16, + 0, + 8, + 9 + ), + ]), + 'rbd-overview.json': + local RgwOverviewStyle(alias, pattern, type, unit) = + $.addStyle(alias, + null, + ['rgba(245, 54, 54, 0.9)', 'rgba(237, 129, 40, 0.89)', 'rgba(50, 172, 45, 0.97)'], + 'YYYY-MM-DD HH:mm:ss', + 2, + 1, + pattern, + [], + type, + unit, + []); + local RbdOverviewPanel(title, formatY1, - 
'short', - null, - null, - 0, - 1, - '$datasource') - .addTargets( - [ - u.addTargetSchema(expr1, - legendFormat1), - u.addTargetSchema(expr2, - legendFormat2), - ] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; + expr1, + expr2, + legendFormat1, + legendFormat2, + x, + y, + w, + h) = + $.graphPanelSchema({}, + title, + '', + 'null', + false, + formatY1, + 'short', + null, + null, + 0, + 1, + '$datasource') + .addTargets( + [ + $.addTargetSchema(expr1, + legendFormat1), + $.addTargetSchema(expr2, + legendFormat2), + ] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; - u.dashboardSchema( - 'RBD Overview', - '', - '41FrpeUiz', - 'now-1h', - '30s', + $.dashboardSchema( + 'RBD Overview', + '', + '41FrpeUiz', + 'now-1h', + '30s', + 16, + $._config.dashboardTags + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.4.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='datasource', id='prometheus', name='Prometheus', version='5.0.0' + ) + .addRequired( + type='panel', id='table', name='Table', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addJobTemplate() + ) + .addPanels([ + RbdOverviewPanel( + 'IOPS', + 'short', + 'round(sum(rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval])))' % $.matchers(), + 'round(sum(rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval])))' % $.matchers(), + 'Writes', + 'Reads', + 0, + 0, + 8, + 7 + ), + RbdOverviewPanel( + 'Throughput', + 'Bps', + 
'round(sum(rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval])))' % $.matchers(), + 'round(sum(rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval])))' % $.matchers(), + 'Write', + 'Read', + 8, + 0, + 8, + 7 + ), + RbdOverviewPanel( + 'Average Latency', + 'ns', + ||| + round( + sum(rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval])) / + sum(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval])) + ) + ||| % $.matchers(), + ||| + round( + sum(rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval])) / + sum(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval])) + ) + ||| % $.matchers(), + 'Write', + 'Read', 16, - c.dashboardTags + ['overview'], + 0, + 8, + 7 + ), + $.addTableSchema( + '$datasource', '', - { - refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } - ) - .addAnnotation( - u.addAnnotationSchema( - 1, - '-- Grafana --', - true, - true, - 'rgba(0, 211, 255, 1)', - 'Annotations & Alerts', - 'dashboard' - ) - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.4.2' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addRequired( - type='datasource', id='prometheus', name='Prometheus', version='5.0.0' + { col: 3, desc: true }, + [ + RgwOverviewStyle('Pool', 'pool', 'string', 'short'), + RgwOverviewStyle('Image', 'image', 'string', 'short'), + RgwOverviewStyle('IOPS', 'Value', 'number', 'iops'), + RgwOverviewStyle('', '/.*/', 'hidden', 'short'), + ], + 'Highest IOPS', + 'table' ) - .addRequired( - type='panel', id='table', name='Table', version='5.0.0' - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - u.addClusterTemplate() - ) - .addTemplate( - u.addJobTemplate() - ) - .addPanels([ - RbdOverviewPanel( - 'IOPS', - 'short', - 
'round(sum(rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval])))' % u.matchers(), - 'round(sum(rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval])))' % u.matchers(), - 'Writes', - 'Reads', - 0, - 0, - 8, - 7 - ), - RbdOverviewPanel( - 'Throughput', - 'Bps', - 'round(sum(rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval])))' % u.matchers(), - 'round(sum(rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval])))' % u.matchers(), - 'Write', - 'Read', - 8, - 0, - 8, - 7 - ), - RbdOverviewPanel( - 'Average Latency', - 'ns', + .addTarget( + $.addTargetSchema( ||| - round( - sum(rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval])) / - sum(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval])) - ) - ||| % u.matchers(), - ||| - round( - sum(rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval])) / - sum(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval])) + topk(10, + ( + sort(( + rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval]) + + on (image, pool, namespace) rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval]) + )) + ) ) - ||| % u.matchers(), - 'Write', - 'Read', - 16, - 0, - 8, - 7 - ), - u.addTableSchema( - '$datasource', + ||| % $.matchers(), '', - { col: 3, desc: true }, - [ - RgwOverviewStyle('Pool', 'pool', 'string', 'short'), - RgwOverviewStyle('Image', 'image', 'string', 'short'), - RgwOverviewStyle('IOPS', 'Value', 'number', 'iops'), - RgwOverviewStyle('', '/.*/', 'hidden', 'short'), - ], - 'Highest IOPS', - 'table' + 'table', + 1, + true ) - .addTarget( - u.addTargetSchema( - ||| - topk(10, - ( - sort(( - rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval]) + - on (image, pool, namespace) rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval]) - )) - ) + ) + { gridPos: { x: 0, y: 7, w: 8, h: 7 } }, + $.addTableSchema( + '$datasource', + '', + { col: 3, desc: true }, + [ + RgwOverviewStyle('Pool', 'pool', 'string', 'short'), + RgwOverviewStyle('Image', 'image', 'string', 'short'), + 
RgwOverviewStyle('Throughput', 'Value', 'number', 'Bps'), + RgwOverviewStyle('', '/.*/', 'hidden', 'short'), + ], + 'Highest Throughput', + 'table' + ) + .addTarget( + $.addTargetSchema( + ||| + topk(10, + sort( + sum( + rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval]) + + rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval]) + ) by (pool, image, namespace) ) - ||| % u.matchers(), - '', - 'table', - 1, - true - ) - ) + { gridPos: { x: 0, y: 7, w: 8, h: 7 } }, - u.addTableSchema( - '$datasource', + ) + ||| % $.matchers(), '', - { col: 3, desc: true }, - [ - RgwOverviewStyle('Pool', 'pool', 'string', 'short'), - RgwOverviewStyle('Image', 'image', 'string', 'short'), - RgwOverviewStyle('Throughput', 'Value', 'number', 'Bps'), - RgwOverviewStyle('', '/.*/', 'hidden', 'short'), - ], - 'Highest Throughput', - 'table' + 'table', + 1, + true ) - .addTarget( - u.addTargetSchema( - ||| - topk(10, - sort( - sum( - rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval]) + - rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval]) - ) by (pool, image, namespace) - ) - ) - ||| % u.matchers(), - '', - 'table', - 1, - true - ) - ) + { gridPos: { x: 8, y: 7, w: 8, h: 7 } }, - u.addTableSchema( - '$datasource', + ) + { gridPos: { x: 8, y: 7, w: 8, h: 7 } }, + $.addTableSchema( + '$datasource', + '', + { col: 3, desc: true }, + [ + RgwOverviewStyle('Pool', 'pool', 'string', 'short'), + RgwOverviewStyle('Image', 'image', 'string', 'short'), + RgwOverviewStyle('Latency', 'Value', 'number', 'ns'), + RgwOverviewStyle('', '/.*/', 'hidden', 'short'), + ], + 'Highest Latency', + 'table' + ) + .addTarget( + $.addTargetSchema( + ||| + topk(10, + sum( + rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval]) / + clamp_min(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval]), 1) + + rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval]) / + clamp_min(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval]), 1) + ) by (pool, image, 
namespace) + ) + ||| % $.matchers(), '', - { col: 3, desc: true }, - [ - RgwOverviewStyle('Pool', 'pool', 'string', 'short'), - RgwOverviewStyle('Image', 'image', 'string', 'short'), - RgwOverviewStyle('Latency', 'Value', 'number', 'ns'), - RgwOverviewStyle('', '/.*/', 'hidden', 'short'), - ], - 'Highest Latency', - 'table' + 'table', + 1, + true ) - .addTarget( - u.addTargetSchema( - ||| - topk(10, - sum( - rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval]) / - clamp_min(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval]), 1) + - rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval]) / - clamp_min(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval]), 1) - ) by (pool, image, namespace) - ) - ||| % u.matchers(), - '', - 'table', - 1, - true - ) - ) + { gridPos: { x: 16, y: 7, w: 8, h: 7 } }, - ]), - }, + ) + { gridPos: { x: 16, y: 7, w: 8, h: 7 } }, + ]), } diff --git a/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/monitoring/ceph-mixin/dashboards/rgw.libsonnet index 2e17bb75b53ee..4b5bc6fea87ba 100644 --- a/monitoring/ceph-mixin/dashboards/rgw.libsonnet +++ b/monitoring/ceph-mixin/dashboards/rgw.libsonnet @@ -2,880 +2,878 @@ local g = import 'grafonnet/grafana.libsonnet'; local u = import 'utils.libsonnet'; local c = (import '../mixin.libsonnet')._config; -{ - grafanaDashboards+:: { - 'radosgw-sync-overview.json': - local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) = - u.graphPanelSchema({}, - title, - '', - 'null as zero', - true, - formatY1, - 'short', - labelY1, - null, - 0, - 1, - '$datasource') - .addTargets( - [ - u.addTargetSchema( - 'sum by (source_zone) (rate(%(rgwMetric)s{%(matchers)s}[$__rate_interval]))' - % (u.matchers() + { rgwMetric: rgwMetric }), - '{{source_zone}}' - ), - ] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; +(import 'utils.libsonnet') { + 'radosgw-sync-overview.json': + local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) = + 
$.graphPanelSchema({}, + title, + '', + 'null as zero', + true, + formatY1, + 'short', + labelY1, + null, + 0, + 1, + '$datasource') + .addTargets( + [ + $.addTargetSchema( + 'sum by (source_zone) (rate(%(rgwMetric)s{%(matchers)s}[$__rate_interval]))' + % ($.matchers() + { rgwMetric: rgwMetric }), + '{{source_zone}}' + ), + ] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; - u.dashboardSchema( - 'RGW Sync Overview', - '', - 'rgw-sync-overview', - 'now-1h', - '15s', - 16, - c.dashboardTags + ['overview'], - '', - { - refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } - ) - .addAnnotation( - u.addAnnotationSchema( - 1, - '-- Grafana --', - true, - true, - 'rgba(0, 211, 255, 1)', - 'Annotations & Alerts', - 'dashboard' - ) - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.0.0' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' + $.dashboardSchema( + 'RGW Sync Overview', + '', + 'rgw-sync-overview', + 'now-1h', + '15s', + 16, + $._config.dashboardTags + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - u.addClusterTemplate() - ) - .addTemplate( - u.addJobTemplate() - ) - .addTemplate( - u.addTemplateSchema( - 'rgw_servers', - '$datasource', - 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), - 1, - true, - 1, - '', - 'RGW Server' - ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', 
id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addJobTemplate() + ) + .addTemplate( + $.addTemplateSchema( + 'rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + true, + 1, + '', + 'RGW Server' ) - .addPanels([ - RgwSyncOverviewPanel( - 'Replication (throughput) from Source Zone', - 'Bps', - null, - 'ceph_data_sync_from_zone_fetch_bytes_sum', - 0, - 0, - 8, - 7 - ), - RgwSyncOverviewPanel( - 'Replication (objects) from Source Zone', - 'short', - 'Objects/s', - 'ceph_data_sync_from_zone_fetch_bytes_count', - 8, - 0, - 8, - 7 - ), - RgwSyncOverviewPanel( - 'Polling Request Latency from Source Zone', - 'ms', - null, - 'ceph_data_sync_from_zone_poll_latency_sum', - 16, - 0, - 8, - 7 - ), - RgwSyncOverviewPanel( - 'Unsuccessful Object Replications from Source Zone', - 'short', - 'Count/s', - 'ceph_data_sync_from_zone_fetch_errors', - 0, - 7, - 8, - 7 - ), - ]), - 'radosgw-overview.json': - local RgwOverviewPanel( + ) + .addPanels([ + RgwSyncOverviewPanel( + 'Replication (throughput) from Source Zone', + 'Bps', + null, + 'ceph_data_sync_from_zone_fetch_bytes_sum', + 0, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Replication (objects) from Source Zone', + 'short', + 'Objects/s', + 'ceph_data_sync_from_zone_fetch_bytes_count', + 8, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Polling Request Latency from Source Zone', + 'ms', + null, + 'ceph_data_sync_from_zone_poll_latency_sum', + 16, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Unsuccessful Object Replications from Source Zone', + 'short', + 'Count/s', + 'ceph_data_sync_from_zone_fetch_errors', + 0, + 7, + 8, + 7 + ), + ]), + 'radosgw-overview.json': + local RgwOverviewPanel( + title, + description, + formatY1, + formatY2, + expr1, + legendFormat1, + x, + y, + w, + h, + 
datasource='$datasource', + legend_alignAsTable=false, + legend_avg=false, + legend_min=false, + legend_max=false, + legend_current=false, + legend_values=false + ) = + $.graphPanelSchema( + {}, title, description, + 'null', + false, formatY1, formatY2, - expr1, - legendFormat1, - x, - y, - w, - h, - datasource='$datasource', - legend_alignAsTable=false, - legend_avg=false, - legend_min=false, - legend_max=false, - legend_current=false, - legend_values=false - ) = - u.graphPanelSchema( - {}, - title, - description, - 'null', - false, - formatY1, - formatY2, - null, - null, - 0, - 1, - datasource, - legend_alignAsTable, - legend_avg, - legend_min, - legend_max, - legend_current, - legend_values - ) - .addTargets( - [u.addTargetSchema(expr1, legendFormat1)] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; + null, + null, + 0, + 1, + datasource, + legend_alignAsTable, + legend_avg, + legend_min, + legend_max, + legend_current, + legend_values + ) + .addTargets( + [$.addTargetSchema(expr1, legendFormat1)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; - u.dashboardSchema( - 'RGW Overview', - '', - 'WAkugZpiz', - 'now-1h', - '15s', - 16, - c.dashboardTags + ['overview'], + $.dashboardSchema( + 'RGW Overview', + '', + 'WAkugZpiz', + 'now-1h', + '15s', + 16, + $._config.dashboardTags + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + 
$.addJobTemplate() + ) + .addTemplate( + $.addTemplateSchema( + 'rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + true, + 1, '', - { - refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } + 'RGW Server' ) - .addAnnotation( - u.addAnnotationSchema( - 1, - '-- Grafana --', - true, - true, - 'rgba(0, 211, 255, 1)', - 'Annotations & Alerts', - 'dashboard' - ) + ) + .addTemplate( + $.addTemplateSchema( + 'code', + '$datasource', + 'label_values(haproxy_server_http_responses_total{job=~"$job_haproxy", instance=~"$ingress_service"}, code)', + 1, + true, + 1, + 'HTTP Code', + '' ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addTemplate( + $.addTemplateSchema( + 'job_haproxy', + '$datasource', + 'label_values(haproxy_server_status, job)', + 1, + true, + 1, + 'job haproxy', + '(.*)', + multi=true, + allValues='.+', + ), + ) + .addTemplate( + $.addTemplateSchema( + 'ingress_service', + '$datasource', + 'label_values(haproxy_server_status{job=~"$job_haproxy"}, instance)', + 1, + true, + 1, + 'Ingress Service', + '' ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addPanels([ + $.addRowSchema(false, + true, + 'RGW Overview - All Gateways') + + { + gridPos: { x: 0, y: 0, w: 24, h: 1 }, + }, + RgwOverviewPanel( + 'Average GET/PUT Latencies', + '', + 's', + 'short', + ||| + rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s} + ||| % $.matchers(), + 'GET AVG', + 0, + 1, + 8, + 7 + ).addTargets( + [ + $.addTargetSchema( + ||| + rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + 
rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s} + ||| % $.matchers(), + 'PUT AVG' + ), + ] + ), + RgwOverviewPanel( + 'Total Requests/sec by RGW Instance', + '', + 'none', + 'short', + ||| + sum by (rgw_host) ( + label_replace( + rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" + ) + ) + ||| % $.matchers(), + '{{rgw_host}}', + 8, + 1, + 7, + 7 + ), + RgwOverviewPanel( + 'GET Latencies by RGW Instance', + 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts', + 's', + 'short', + ||| + label_replace( + rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" + ) + ||| % $.matchers(), + '{{rgw_host}}', + 15, + 1, + 6, + 7 + ), + RgwOverviewPanel( + 'Bandwidth Consumed by Type', + 'Total bytes transferred in/out of all radosgw instances within the cluster', + 'bytes', + 'short', + 'sum(rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]))' % $.matchers(), + 'GETs', + 0, + 8, + 8, + 6 + ).addTargets( + [$.addTargetSchema('sum(rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]))' % $.matchers(), + 'PUTs')] + ), + RgwOverviewPanel( + 'Bandwidth by RGW Instance', + 'Total bytes transferred in/out through get/put operations, by radosgw instance', + 'bytes', + 'short', + ||| + label_replace(sum by (instance_id) ( + rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) + + rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval])) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" + ) + ||| % $.matchers(), + 
'{{rgw_host}}', + 8, + 8, + 7, + 6 + ), + RgwOverviewPanel( + 'PUT Latencies by RGW Instance', + 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts', + 's', + 'short', + ||| + label_replace( + rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, + "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" + ) + ||| % $.matchers(), + '{{rgw_host}}', + 15, + 8, + 6, + 6 + ), + $.addRowSchema( + false, true, 'RGW Overview - HAProxy Metrics' + ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } }, + RgwOverviewPanel( + 'Total responses by HTTP code', + '', + 'short', + 'short', + ||| + sum( + rate( + haproxy_frontend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"frontend"}[$__rate_interval] + ) + ) by (code) + |||, + 'Frontend {{ code }}', + 0, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true ) - .addTemplate( - g.template.datasource('datasource', - 'prometheus', - 'default', - label='Data Source') + .addTargets( + [ + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"backend"}[$__rate_interval] + ) + ) by (code) + |||, 'Backend {{ code }}' + ), + ] ) - .addTemplate( - u.addClusterTemplate() + .addSeriesOverride([ + { + alias: '/.*Back.*/', + transform: 'negative-Y', + }, + { alias: '/.*1.*/' }, + { alias: '/.*2.*/' }, + { alias: '/.*3.*/' }, + { alias: '/.*4.*/' }, + { alias: '/.*5.*/' }, + { alias: '/.*other.*/' }, + ]), + RgwOverviewPanel( + 'Total requests / responses', + '', + 'short', + 'short', + ||| + sum( + rate( + haproxy_frontend_http_requests_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, + 
'Requests', + 5, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true ) - .addTemplate( - u.addJobTemplate() + .addTargets( + [ + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_response_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Response errors', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_frontend_request_errors_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Requests errors' + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_redispatch_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Backend redispatch', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_retry_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Backend retry', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_frontend_requests_denied_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Request denied', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + haproxy_backend_current_queue{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"} + ) by (instance) + |||, 'Backend Queued', 'time_series', 2 + ), + ] ) - .addTemplate( - u.addTemplateSchema( - 'rgw_servers', - '$datasource', - 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), - 1, - true, - 1, - '', - 'RGW Server' - ) + .addSeriesOverride([ + { + alias: '/.*Response.*/', + transform: 'negative-Y', + }, + { + alias: '/.*Backend.*/', + transform: 'negative-Y', + }, + ]), + RgwOverviewPanel( + 'Total number of connections', + '', + 'short', + 'short', + ||| + sum( + rate( + 
haproxy_frontend_connections_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, + 'Front', + 10, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true ) - .addTemplate( - u.addTemplateSchema( - 'code', - '$datasource', - 'label_values(haproxy_server_http_responses_total{job=~"$job_haproxy", instance=~"$ingress_service"}, code)', - 1, - true, - 1, - 'HTTP Code', - '' - ) + .addTargets( + [ + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_connection_attempts_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Back' + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_connection_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) + ) by (instance) + |||, 'Back errors' + ), + ] ) - .addTemplate( - u.addTemplateSchema( - 'job_haproxy', - '$datasource', - 'label_values(haproxy_server_status, job)', - 1, - true, - 1, - 'job haproxy', - '(.*)', - multi=true, - allValues='.+', - ), + .addSeriesOverride([ + { + alias: '/.*Back.*/', + transform: 'negative-Y', + }, + ]), + RgwOverviewPanel( + 'Current total of incoming / outgoing bytes', + '', + 'short', + 'short', + ||| + sum( + rate( + haproxy_frontend_bytes_in_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) * 8 + ) by (instance) + |||, + 'IN Front', + 15, + 12, + 6, + 12, + '$datasource', + true, + true, + true, + true, + true, + true ) - .addTemplate( - u.addTemplateSchema( - 'ingress_service', - '$datasource', - 'label_values(haproxy_server_status{job=~"$job_haproxy"}, instance)', - 1, - true, - 1, - 'Ingress Service', - '' - ) + .addTargets( + [ + $.addTargetSchema( + ||| + sum( + rate( + haproxy_frontend_bytes_out_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) * 8 + ) by 
(instance) + |||, 'OUT Front', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_bytes_in_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) * 8 + ) by (instance) + |||, 'IN Back', 'time_series', 2 + ), + $.addTargetSchema( + ||| + sum( + rate( + haproxy_backend_bytes_out_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] + ) * 8 + ) by (instance) + |||, 'OUT Back', 'time_series', 2 + ), + ] ) - .addPanels([ - u.addRowSchema(false, - true, - 'RGW Overview - All Gateways') + + .addSeriesOverride([ { - gridPos: { x: 0, y: 0, w: 24, h: 1 }, + alias: '/.*OUT.*/', + transform: 'negative-Y', }, - RgwOverviewPanel( - 'Average GET/PUT Latencies', - '', - 's', - 'short', - ||| - rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) / - rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s} - ||| % u.matchers(), - 'GET AVG', - 0, - 1, - 8, - 7 - ).addTargets( - [ - u.addTargetSchema( - ||| - rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) / - rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s} - ||| % u.matchers(), - 'PUT AVG' - ), - ] - ), - RgwOverviewPanel( - 'Total Requests/sec by RGW Instance', - '', - 'none', - 'short', - ||| - sum by (rgw_host) ( - label_replace( - rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, - "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" - ) - ) - ||| % u.matchers(), - '{{rgw_host}}', - 8, - 1, - 7, - 7 - ), - RgwOverviewPanel( - 'GET Latencies by RGW Instance', - 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts', - 's', - 'short', - ||| - label_replace( - 
rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) / - rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, - "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" - ) - ||| % u.matchers(), - '{{rgw_host}}', - 15, - 1, - 6, - 7 - ), - RgwOverviewPanel( - 'Bandwidth Consumed by Type', - 'Total bytes transferred in/out of all radosgw instances within the cluster', - 'bytes', - 'short', - 'sum(rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]))' % u.matchers(), - 'GETs', - 0, - 8, - 8, - 6 - ).addTargets( - [u.addTargetSchema('sum(rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]))' % u.matchers(), - 'PUTs')] - ), - RgwOverviewPanel( - 'Bandwidth by RGW Instance', - 'Total bytes transferred in/out through get/put operations, by radosgw instance', - 'bytes', - 'short', - ||| - label_replace(sum by (instance_id) ( - rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) + - rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval])) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, - "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" - ) - ||| % u.matchers(), - '{{rgw_host}}', - 8, - 8, - 7, - 6 - ), - RgwOverviewPanel( - 'PUT Latencies by RGW Instance', - 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts', - 's', - 'short', - ||| - label_replace( - rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) / - rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, - "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" - ) - ||| % u.matchers(), - '{{rgw_host}}', - 15, - 8, - 6, - 6 - ), - u.addRowSchema( - false, true, 'RGW Overview - HAProxy Metrics' - ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } }, - RgwOverviewPanel( - 'Total responses by HTTP code', - '', - 'short', - 'short', - ||| - sum( - 
rate( - haproxy_frontend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"frontend"}[$__rate_interval] - ) - ) by (code) - |||, - 'Frontend {{ code }}', - 0, - 12, - 5, - 12, - '$datasource', - true, - true, - true, - true, - true, - true - ) - .addTargets( - [ - u.addTargetSchema( - ||| - sum( - rate( - haproxy_backend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"backend"}[$__rate_interval] - ) - ) by (code) - |||, 'Backend {{ code }}' - ), - ] - ) - .addSeriesOverride([ - { - alias: '/.*Back.*/', - transform: 'negative-Y', - }, - { alias: '/.*1.*/' }, - { alias: '/.*2.*/' }, - { alias: '/.*3.*/' }, - { alias: '/.*4.*/' }, - { alias: '/.*5.*/' }, - { alias: '/.*other.*/' }, - ]), - RgwOverviewPanel( - 'Total requests / responses', - '', - 'short', - 'short', - ||| - sum( - rate( - haproxy_frontend_http_requests_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) - ) by (instance) - |||, - 'Requests', - 5, - 12, - 5, - 12, - '$datasource', - true, - true, - true, - true, - true, - true - ) - .addTargets( - [ - u.addTargetSchema( - ||| - sum( - rate( - haproxy_backend_response_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) - ) by (instance) - |||, 'Response errors', 'time_series', 2 - ), - u.addTargetSchema( - ||| - sum( - rate( - haproxy_frontend_request_errors_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) - ) by (instance) - |||, 'Requests errors' - ), - u.addTargetSchema( - ||| - sum( - rate( - haproxy_backend_redispatch_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) - ) by (instance) - |||, 'Backend redispatch', 'time_series', 2 - ), - u.addTargetSchema( - ||| - sum( - rate( - haproxy_backend_retry_warnings_total{proxy=~"backend", job=~"$job_haproxy", 
instance=~"$ingress_service"}[$__rate_interval] - ) - ) by (instance) - |||, 'Backend retry', 'time_series', 2 - ), - u.addTargetSchema( - ||| - sum( - rate( - haproxy_frontend_requests_denied_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) - ) by (instance) - |||, 'Request denied', 'time_series', 2 - ), - u.addTargetSchema( - ||| - sum( - haproxy_backend_current_queue{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"} - ) by (instance) - |||, 'Backend Queued', 'time_series', 2 - ), - ] - ) - .addSeriesOverride([ - { - alias: '/.*Response.*/', - transform: 'negative-Y', - }, - { - alias: '/.*Backend.*/', - transform: 'negative-Y', - }, - ]), - RgwOverviewPanel( - 'Total number of connections', - '', - 'short', - 'short', - ||| - sum( - rate( - haproxy_frontend_connections_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) - ) by (instance) - |||, - 'Front', - 10, - 12, - 5, - 12, - '$datasource', - true, - true, - true, - true, - true, - true - ) - .addTargets( - [ - u.addTargetSchema( - ||| - sum( - rate( - haproxy_backend_connection_attempts_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) - ) by (instance) - |||, 'Back' - ), - u.addTargetSchema( - ||| - sum( - rate( - haproxy_backend_connection_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) - ) by (instance) - |||, 'Back errors' - ), - ] - ) - .addSeriesOverride([ - { - alias: '/.*Back.*/', - transform: 'negative-Y', - }, - ]), - RgwOverviewPanel( - 'Current total of incoming / outgoing bytes', - '', - 'short', - 'short', - ||| - sum( - rate( - haproxy_frontend_bytes_in_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) * 8 - ) by (instance) - |||, - 'IN Front', - 15, - 12, - 6, - 12, - '$datasource', - true, - true, - true, - true, - true, - true - 
) - .addTargets( - [ - u.addTargetSchema( - ||| - sum( - rate( - haproxy_frontend_bytes_out_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) * 8 - ) by (instance) - |||, 'OUT Front', 'time_series', 2 - ), - u.addTargetSchema( - ||| - sum( - rate( - haproxy_backend_bytes_in_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) * 8 - ) by (instance) - |||, 'IN Back', 'time_series', 2 - ), - u.addTargetSchema( - ||| - sum( - rate( - haproxy_backend_bytes_out_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] - ) * 8 - ) by (instance) - |||, 'OUT Back', 'time_series', 2 - ), - ] - ) - .addSeriesOverride([ - { - alias: '/.*OUT.*/', - transform: 'negative-Y', - }, - ]), ]), - 'radosgw-detail.json': - local RgwDetailsPanel(aliasColors, - title, - description, - formatY1, - formatY2, - expr1, - expr2, - legendFormat1, - legendFormat2, - x, - y, - w, - h) = - u.graphPanelSchema(aliasColors, - title, - description, - 'null', - false, - formatY1, - formatY2, - null, - null, - 0, - 1, - '$datasource') - .addTargets( - [u.addTargetSchema(expr1, legendFormat1), u.addTargetSchema(expr2, legendFormat2)] - ) + { gridPos: { x: x, y: y, w: w, h: h } }; + ]), + 'radosgw-detail.json': + local RgwDetailsPanel(aliasColors, + title, + description, + formatY1, + formatY2, + expr1, + expr2, + legendFormat1, + legendFormat2, + x, + y, + w, + h) = + $.graphPanelSchema(aliasColors, + title, + description, + 'null', + false, + formatY1, + formatY2, + null, + null, + 0, + 1, + '$datasource') + .addTargets( + [$.addTargetSchema(expr1, legendFormat1), $.addTargetSchema(expr2, legendFormat2)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; - u.dashboardSchema( - 'RGW Instance Detail', + $.dashboardSchema( + 'RGW Instance Detail', + '', + 'x5ARzZtmk', + 'now-1h', + '15s', + 16, + $._config.dashboardTags + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', 
'15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + $.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', + id='grafana-piechart-panel', + name='Pie Chart', + version='1.3.3' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + $.addClusterTemplate() + ) + .addTemplate( + $.addJobTemplate() + ) + .addTemplate( + $.addTemplateSchema('rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), + 1, + true, + 1, + '', + '') + ) + .addPanels([ + $.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + RgwDetailsPanel( + {}, + '$rgw_servers GET/PUT Latencies', '', - 'x5ARzZtmk', - 'now-1h', - '15s', - 16, - c.dashboardTags + ['overview'], + 's', + 'short', + ||| + sum by (instance_id) ( + rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) + ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + ||| + sum by (instance_id) ( + rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) + ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + 'GET {{ceph_daemon}}', + 'PUT {{ceph_daemon}}', + 0, + 1, + 6, + 8 + ), + RgwDetailsPanel( + {}, + 'Bandwidth by HTTP Operation', '', + 'bytes', + 'short', + ||| 
+ rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + ||| + rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) + ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + 'GETs {{ceph_daemon}}', + 'PUTs {{ceph_daemon}}', + 6, + 1, + 7, + 8 + ), + RgwDetailsPanel( { - refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], - time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], - } - ) - .addAnnotation( - u.addAnnotationSchema( - 1, - '-- Grafana --', - true, - true, - 'rgba(0, 211, 255, 1)', - 'Annotations & Alerts', - 'dashboard' - ) - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.0.0' - ) - .addRequired( - type='panel', - id='grafana-piechart-panel', - name='Pie Chart', - version='1.3.3' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addTemplate( - g.template.datasource('datasource', - 'prometheus', - 'default', - label='Data Source') - ) - .addTemplate( - u.addClusterTemplate() - ) - .addTemplate( - u.addJobTemplate() + GETs: '#7eb26d', + Other: '#447ebc', + PUTs: '#eab839', + Requests: '#3f2b5b', + 'Requests Failed': '#bf1b00', + }, + 'HTTP Request Breakdown', + '', + 'short', + 'short', + ||| + rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s,ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + ||| + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + 'Requests Failed {{ceph_daemon}}', + 'GETs {{ceph_daemon}}', + 13, + 1, + 7, + 8 ) - .addTemplate( - u.addTemplateSchema('rgw_servers', - '$datasource', - 
'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), - 1, - true, - 1, - '', - '') + .addTargets( + [ + $.addTargetSchema( + ||| + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + 'PUTs {{ceph_daemon}}' + ), + $.addTargetSchema( + ||| + ( + rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) - + ( + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) + + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) + ) + ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + 'Other {{ceph_daemon}}' + ), + ] + ), + $.addPieChartSchema( + { + GETs: '#7eb26d', + 'Other (HEAD,POST,DELETE)': '#447ebc', + PUTs: '#eab839', + Requests: '#3f2b5b', + Failures: '#bf1b00', + }, '$datasource', '', 'Under graph', 'pie', 'Workload Breakdown', 'current' ) - .addPanels([ - u.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, - RgwDetailsPanel( - {}, - '$rgw_servers GET/PUT Latencies', - '', - 's', - 'short', - ||| - sum by (instance_id) ( - rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) / - rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) - ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - ||| - sum by (instance_id) ( - rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) / - rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) - ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - 'GET {{ceph_daemon}}', - 'PUT {{ceph_daemon}}', - 0, - 1, - 6, - 8 - ), - RgwDetailsPanel( - {}, - 'Bandwidth by HTTP Operation', - '', - 'bytes', - 'short', - ||| - 
rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - ||| - rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) - ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - 'GETs {{ceph_daemon}}', - 'PUTs {{ceph_daemon}}', - 6, - 1, - 7, - 8 - ), - RgwDetailsPanel( - { - GETs: '#7eb26d', - Other: '#447ebc', - PUTs: '#eab839', - Requests: '#3f2b5b', - 'Requests Failed': '#bf1b00', - }, - 'HTTP Request Breakdown', - '', - 'short', - 'short', - ||| - rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s,ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - ||| - rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - 'Requests Failed {{ceph_daemon}}', - 'GETs {{ceph_daemon}}', - 13, - 1, - 7, - 8 - ) - .addTargets( - [ - u.addTargetSchema( - ||| - rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - 'PUTs {{ceph_daemon}}' - ), - u.addTargetSchema( - ||| - ( - rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) - - ( - rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) + - rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) - ) - ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - 'Other {{ceph_daemon}}' - ), - ] - ), - u.addPieChartSchema( - { - GETs: '#7eb26d', - 'Other (HEAD,POST,DELETE)': '#447ebc', - PUTs: '#eab839', - Requests: '#3f2b5b', - Failures: '#bf1b00', - }, '$datasource', '', 'Under graph', 'pie', 'Workload Breakdown', 'current' - ) - 
.addTarget(u.addTargetSchema( - ||| - rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - 'Failures {{ceph_daemon}}' - )) - .addTarget(u.addTargetSchema( - ||| - rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - 'GETs {{ceph_daemon}}' - )) - .addTarget(u.addTargetSchema( - ||| - rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - 'PUTs {{ceph_daemon}}' - )) - .addTarget(u.addTargetSchema( - ||| - ( - rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) - - ( - rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) + - rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) - ) - ) * on (instance_id) group_left (ceph_daemon) - ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} - ||| % u.matchers(), - 'Other (DELETE,LIST) {{ceph_daemon}}' - )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } }, - ]), - }, + .addTarget($.addTargetSchema( + ||| + rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + 'Failures {{ceph_daemon}}' + )) + .addTarget($.addTargetSchema( + ||| + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + 'GETs {{ceph_daemon}}' + )) + .addTarget($.addTargetSchema( + ||| + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + 'PUTs {{ceph_daemon}}' + )) + .addTarget($.addTargetSchema( + 
||| + ( + rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) - + ( + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) + + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) + ) + ) * on (instance_id) group_left (ceph_daemon) + ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % $.matchers(), + 'Other (DELETE,LIST) {{ceph_daemon}}' + )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } }, + ]), } diff --git a/monitoring/ceph-mixin/dashboards/utils.libsonnet b/monitoring/ceph-mixin/dashboards/utils.libsonnet index b5d3500a4e31e..d0c007db142e4 100644 --- a/monitoring/ceph-mixin/dashboards/utils.libsonnet +++ b/monitoring/ceph-mixin/dashboards/utils.libsonnet @@ -1,7 +1,8 @@ local g = import 'grafonnet/grafana.libsonnet'; -local c = (import '../mixin.libsonnet')._config; { + _config:: error 'must provide _config', + dashboardSchema(title, description, uid, @@ -180,7 +181,7 @@ local c = (import '../mixin.libsonnet')._config; matchers():: local jobMatcher = 'job=~"$job"'; - local clusterMatcher = '%s=~"$cluster"' % c.clusterLabel; + local clusterMatcher = '%s=~"$cluster"' % $._config.clusterLabel; { // Common labels jobMatcher: jobMatcher, @@ -198,7 +199,7 @@ local c = (import '../mixin.libsonnet')._config; 1, 'cluster', '(.*)', - if !c.showMultiCluster then 'variable' else '', + if !$._config.showMultiCluster then 'variable' else '', multi=true, allValues='.+', ), diff --git a/monitoring/ceph-mixin/mixin.libsonnet b/monitoring/ceph-mixin/mixin.libsonnet index c89b2a916a891..3c983a300195b 100644 --- a/monitoring/ceph-mixin/mixin.libsonnet +++ b/monitoring/ceph-mixin/mixin.libsonnet @@ -1,3 +1,3 @@ (import 'config.libsonnet') + -(import 'dashboards/dashboards.libsonnet') + +(import 'dashboards.libsonnet') + (import 'alerts.libsonnet')