--- /dev/null
+{
+ grafanaDashboards+::
+ (import 'dashboards/cephfs.libsonnet') +
+ (import 'dashboards/host.libsonnet') +
+ (import 'dashboards/osd.libsonnet') +
+ (import 'dashboards/pool.libsonnet') +
+ (import 'dashboards/rbd.libsonnet') +
+ (import 'dashboards/rgw.libsonnet') +
+ { _config:: $._config },
+}
local g = import 'grafonnet/grafana.libsonnet';
-local u = import 'utils.libsonnet';
-local c = (import '../mixin.libsonnet')._config;
-{
- grafanaDashboards+:: {
- 'cephfs-overview.json':
- local CephfsOverviewGraphPanel(title, formatY1, labelY1, expr, legendFormat, x, y, w, h) =
- u.graphPanelSchema({},
- title,
- '',
- 'null',
- false,
- formatY1,
- 'short',
- labelY1,
- null,
- 0,
- 1,
- '$datasource')
- .addTargets(
- [u.addTargetSchema(expr, legendFormat)]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
+(import 'utils.libsonnet') {
+ 'cephfs-overview.json':
+ local CephfsOverviewGraphPanel(title, formatY1, labelY1, expr, legendFormat, x, y, w, h) =
+ $.graphPanelSchema({},
+ title,
+ '',
+ 'null',
+ false,
+ formatY1,
+ 'short',
+ labelY1,
+ null,
+ 0,
+ 1,
+ '$datasource')
+ .addTargets(
+ [$.addTargetSchema(expr, legendFormat)]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
- u.dashboardSchema(
- 'MDS Performance',
- '',
- 'tbO9LAiZz',
- 'now-1h',
- '15s',
- 16,
- c.dashboardTags,
- '',
- {
- refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
+ $.dashboardSchema(
+ 'MDS Performance',
+ '',
+ 'tbO9LAiZz',
+ 'now-1h',
+ '15s',
+ 16,
+ $._config.dashboardTags,
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
)
- .addAnnotation(
- u.addAnnotationSchema(
- 1,
- '-- Grafana --',
- true,
- true,
- 'rgba(0, 211, 255, 1)',
- 'Annotations & Alerts',
- 'dashboard'
- )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('mds_servers',
+ '$datasource',
+ 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ 'MDS Server',
+ '')
+ )
+ .addPanels([
+ $.addRowSchema(false, true, 'MDS Performance') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
+ CephfsOverviewGraphPanel(
+ 'MDS Workload - $mds_servers',
+ 'none',
+ 'Reads(-) / Writes (+)',
+ 'sum(rate(ceph_objecter_op_r{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % $.matchers(),
+ 'Read Ops',
+ 0,
+ 1,
+ 12,
+ 9
)
- .addRequired(
- type='grafana', id='grafana', name='Grafana', version='5.3.2'
- )
- .addRequired(
- type='panel', id='graph', name='Graph', version='5.0.0'
- )
- .addTemplate(
- g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
- )
- .addTemplate(
- u.addClusterTemplate()
- )
- .addTemplate(
- u.addJobTemplate()
- )
- .addTemplate(
- u.addTemplateSchema('mds_servers',
- '$datasource',
- 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % u.matchers(),
- 1,
- true,
- 1,
- 'MDS Server',
- '')
- )
- .addPanels([
- u.addRowSchema(false, true, 'MDS Performance') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
- CephfsOverviewGraphPanel(
- 'MDS Workload - $mds_servers',
- 'none',
- 'Reads(-) / Writes (+)',
- 'sum(rate(ceph_objecter_op_r{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % u.matchers(),
- 'Read Ops',
- 0,
- 1,
- 12,
- 9
- )
- .addTarget(u.addTargetSchema(
- 'sum(rate(ceph_objecter_op_w{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % u.matchers(),
- 'Write Ops'
- ))
- .addSeriesOverride(
- { alias: '/.*Reads/', transform: 'negative-Y' }
- ),
- CephfsOverviewGraphPanel(
- 'Client Request Load - $mds_servers',
- 'none',
- 'Client Requests',
- 'ceph_mds_server_handle_client_request{%(matchers)s, ceph_daemon=~"($mds_servers).*"}' % u.matchers(),
- '{{ceph_daemon}}',
- 12,
- 1,
- 12,
- 9
- ),
- ]),
- },
+ .addTarget($.addTargetSchema(
+ 'sum(rate(ceph_objecter_op_w{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % $.matchers(),
+ 'Write Ops'
+ ))
+ .addSeriesOverride(
+ { alias: '/.*Reads/', transform: 'negative-Y' }
+ ),
+ CephfsOverviewGraphPanel(
+ 'Client Request Load - $mds_servers',
+ 'none',
+ 'Client Requests',
+ 'ceph_mds_server_handle_client_request{%(matchers)s, ceph_daemon=~"($mds_servers).*"}' % $.matchers(),
+ '{{ceph_daemon}}',
+ 12,
+ 1,
+ 12,
+ 9
+ ),
+ ]),
}
+++ /dev/null
-(import '../config.libsonnet') +
-(import 'cephfs.libsonnet') +
-(import 'host.libsonnet') +
-(import 'osd.libsonnet') +
-(import 'pool.libsonnet') +
-(import 'rbd.libsonnet') +
-(import 'rgw.libsonnet')
local g = import 'grafonnet/grafana.libsonnet';
-local u = import 'utils.libsonnet';
-local c = (import '../mixin.libsonnet')._config;
-{
- grafanaDashboards+:: {
- 'hosts-overview.json':
- local HostsOverviewSingleStatPanel(format,
- title,
- description,
- valueName,
- expr,
- instant,
- x,
- y,
- w,
- h) =
- u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
- '$datasource',
- format,
- title,
- description,
- valueName,
- false,
- 100,
- false,
- false,
- '')
- .addTarget(
- u.addTargetSchema(expr, '', 'time_series', 1, instant)
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
-
- local HostsOverviewGraphPanel(title, description, formatY1, expr, legendFormat, x, y, w, h) =
- u.graphPanelSchema(
- {}, title, description, 'null', false, formatY1, 'short', null, null, 0, 1, '$datasource'
- )
- .addTargets(
- [u.addTargetSchema(
- expr, legendFormat
- )]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
-
- u.dashboardSchema(
- 'Host Overview',
- '',
- 'y0KGL0iZz',
- 'now-1h',
- '10s',
- 16,
- c.dashboardTags,
- '',
- {
- refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
- )
- .addRequired(
- type='grafana', id='grafana', name='Grafana', version='5.3.2'
- )
- .addRequired(
- type='panel', id='graph', name='Graph', version='5.0.0'
- )
- .addRequired(
- type='panel', id='singlestat', name='Singlestat', version='5.0.0'
- )
- .addAnnotation(
- u.addAnnotationSchema(
- 1,
- '-- Grafana --',
- true,
- true,
- 'rgba(0, 211, 255, 1)',
- 'Annotations & Alerts',
- 'dashboard'
- )
- )
- .addTemplate(
- g.template.datasource('datasource',
- 'prometheus',
- 'default',
- label='Data Source')
- )
- .addTemplate(
- u.addClusterTemplate()
- )
- .addTemplate(
- u.addJobTemplate()
- )
- .addTemplate(
- u.addTemplateSchema('osd_hosts',
- '$datasource',
- 'label_values(ceph_disk_occupation{%(matchers)s}, exported_instance)' % u.matchers(),
- 1,
- true,
- 1,
- null,
- '([^.]*).*')
- )
- .addTemplate(
- u.addTemplateSchema('mon_hosts',
- '$datasource',
- 'label_values(ceph_mon_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(),
- 1,
- true,
- 1,
- null,
- 'mon.(.*)')
- )
- .addTemplate(
- u.addTemplateSchema('mds_hosts',
- '$datasource',
- 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % u.matchers(),
- 1,
- true,
- 1,
- null,
- 'mds.(.*)')
- )
- .addTemplate(
- u.addTemplateSchema('rgw_hosts',
- '$datasource',
- 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(),
- 1,
- true,
- 1,
- null,
- 'rgw.(.*)')
- )
- .addPanels([
- HostsOverviewSingleStatPanel(
- 'none',
- 'OSD Hosts',
- '',
- 'current',
- 'count(sum by (hostname) (ceph_osd_metadata{%(matchers)s}))' % u.matchers(),
- true,
- 0,
- 0,
- 4,
- 5
- ),
- HostsOverviewSingleStatPanel(
- 'percentunit',
- 'AVG CPU Busy',
- 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster',
- 'current',
- |||
- avg(1 - (
- avg by(instance) (
- rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or
- rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval])
- )
- ))
- |||,
- true,
- 4,
- 0,
- 4,
- 5
- ),
- HostsOverviewSingleStatPanel(
- 'percentunit',
- 'AVG RAM Utilization',
- 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)',
- 'current',
- |||
- avg ((
- (
- node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
- node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
- ) - ((
- node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
- node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) +
- (
- node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
- node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
- ) + (
- node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
- node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
- ) + (
- node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
- node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
- )
- )
- ) / (
- node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
- node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"}
- ))
- |||,
- true,
- 8,
- 0,
- 4,
- 5
- ),
- HostsOverviewSingleStatPanel(
- 'none',
- 'Physical IOPS',
- 'IOPS Load at the device as reported by the OS on all OSD hosts',
- 'current',
- |||
- sum ((
- rate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or
- rate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval])
- ) + (
- rate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or
- rate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval])
- ))
- |||,
- true,
- 12,
- 0,
- 4,
- 5
- ),
- HostsOverviewSingleStatPanel(
- 'percent',
- 'AVG Disk Utilization',
- 'Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)',
- 'current',
- |||
- avg (
- label_replace(
- (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or
- (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100),
- "instance", "$1", "instance", "([^.:]*).*"
- ) * on(instance, device) group_left(ceph_daemon) label_replace(
- label_replace(
- ceph_disk_occupation_human{%(matchers)s, instance=~"($osd_hosts).*"},
- "device", "$1", "device", "/dev/(.*)"
- ), "instance", "$1", "instance", "([^.:]*).*"
- )
- )
- ||| % u.matchers(),
- true,
- 16,
- 0,
- 4,
- 5
- ),
- HostsOverviewSingleStatPanel(
- 'bytes',
- 'Network Load',
- 'Total send/receive network load across all hosts in the ceph cluster',
- 'current',
- |||
- sum (
- (
- rate(node_network_receive_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or
- rate(node_network_receive_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval])
- ) unless on (device, instance)
- label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
- ) +
- sum (
- (
- rate(node_network_transmit_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or
- rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval])
- ) unless on (device, instance)
- label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
- )
- |||
- ,
- true,
- 20,
- 0,
- 4,
- 5
- ),
- HostsOverviewGraphPanel(
- 'CPU Busy - Top 10 Hosts',
- 'Show the top 10 busiest hosts by cpu',
- 'percent',
- |||
- topk(10,
- 100 * (
- 1 - (
- avg by(instance) (
- rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or
- rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval])
- )
- )
- )
- )
- |||,
- '{{instance}}',
- 0,
- 5,
- 12,
- 9
- ),
- HostsOverviewGraphPanel(
- 'Network Load - Top 10 Hosts', 'Top 10 hosts by network load', 'Bps', |||
- topk(10, (sum by(instance) (
- (
- rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
- rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
- ) +
- (
- rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
- rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
- ) unless on (device, instance)
- label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)"))
- ))
- |||
- , '{{instance}}', 12, 5, 12, 9
- ),
- ]),
- 'host-details.json':
- local HostDetailsSingleStatPanel(format,
+(import 'utils.libsonnet') {
+ 'hosts-overview.json':
+ local HostsOverviewSingleStatPanel(format,
title,
description,
valueName,
expr,
+ instant,
x,
y,
w,
h) =
- u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
- '$datasource',
- format,
- title,
- description,
- valueName,
- false,
- 100,
- false,
- false,
- '')
- .addTarget(u.addTargetSchema(expr)) + { gridPos: { x: x, y: y, w: w, h: h } };
+ $.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
+ '$datasource',
+ format,
+ title,
+ description,
+ valueName,
+ false,
+ 100,
+ false,
+ false,
+ '')
+ .addTarget(
+ $.addTargetSchema(expr, '', 'time_series', 1, instant)
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
- local HostDetailsGraphPanel(alias,
- title,
- description,
- nullPointMode,
- formatY1,
- labelY1,
- expr,
- legendFormat,
- x,
- y,
- w,
- h) =
- u.graphPanelSchema(alias,
- title,
- description,
- nullPointMode,
- false,
- formatY1,
- 'short',
- labelY1,
- null,
- null,
- 1,
- '$datasource')
- .addTargets(
- [u.addTargetSchema(expr, legendFormat)]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
+ local HostsOverviewGraphPanel(title, description, formatY1, expr, legendFormat, x, y, w, h) =
+ $.graphPanelSchema(
+ {}, title, description, 'null', false, formatY1, 'short', null, null, 0, 1, '$datasource'
+ )
+ .addTargets(
+ [$.addTargetSchema(
+ expr, legendFormat
+ )]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
- u.dashboardSchema(
- 'Host Details',
+ $.dashboardSchema(
+ 'Host Overview',
+ '',
+ 'y0KGL0iZz',
+ 'now-1h',
+ '10s',
+ 16,
+ $._config.dashboardTags,
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='singlestat', name='Singlestat', version='5.0.0'
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addTemplate(
+ g.template.datasource('datasource',
+ 'prometheus',
+ 'default',
+ label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('osd_hosts',
+ '$datasource',
+ 'label_values(ceph_disk_occupation{%(matchers)s}, exported_instance)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ null,
+ '([^.]*).*')
+ )
+ .addTemplate(
+ $.addTemplateSchema('mon_hosts',
+ '$datasource',
+ 'label_values(ceph_mon_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ null,
+ 'mon.(.*)')
+ )
+ .addTemplate(
+ $.addTemplateSchema('mds_hosts',
+ '$datasource',
+ 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ null,
+ 'mds.(.*)')
+ )
+ .addTemplate(
+ $.addTemplateSchema('rgw_hosts',
+ '$datasource',
+ 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ null,
+ 'rgw.(.*)')
+ )
+ .addPanels([
+ HostsOverviewSingleStatPanel(
+ 'none',
+ 'OSD Hosts',
'',
- 'rtOg0AiWz',
- 'now-1h',
- '10s',
+ 'current',
+ 'count(sum by (hostname) (ceph_osd_metadata{%(matchers)s}))' % $.matchers(),
+ true,
+ 0,
+ 0,
+ 4,
+ 5
+ ),
+ HostsOverviewSingleStatPanel(
+ 'percentunit',
+ 'AVG CPU Busy',
+ 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster',
+ 'current',
+ |||
+ avg(1 - (
+ avg by(instance) (
+ rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or
+ rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval])
+ )
+ ))
+ |||,
+ true,
+ 4,
+ 0,
+ 4,
+ 5
+ ),
+ HostsOverviewSingleStatPanel(
+ 'percentunit',
+ 'AVG RAM Utilization',
+ 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)',
+ 'current',
+ |||
+ avg ((
+ (
+ node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
+ ) - ((
+ node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) +
+ (
+ node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
+ ) + (
+ node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
+ ) + (
+ node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
+ )
+ )
+ ) / (
+ node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
+ node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"}
+ ))
+ |||,
+ true,
+ 8,
+ 0,
+ 4,
+ 5
+ ),
+ HostsOverviewSingleStatPanel(
+ 'none',
+ 'Physical IOPS',
+ 'IOPS Load at the device as reported by the OS on all OSD hosts',
+ 'current',
+ |||
+ sum ((
+ rate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or
+ rate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval])
+ ) + (
+ rate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or
+ rate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval])
+ ))
+ |||,
+ true,
+ 12,
+ 0,
+ 4,
+ 5
+ ),
+ HostsOverviewSingleStatPanel(
+ 'percent',
+ 'AVG Disk Utilization',
+ 'Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)',
+ 'current',
+ |||
+ avg (
+ label_replace(
+ (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or
+ (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100),
+ "instance", "$1", "instance", "([^.:]*).*"
+ ) * on(instance, device) group_left(ceph_daemon) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, instance=~"($osd_hosts).*"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^.:]*).*"
+ )
+ )
+ ||| % $.matchers(),
+ true,
16,
- c.dashboardTags + ['overview'],
- '',
- {
- refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
- )
- .addRequired(
- type='grafana', id='grafana', name='Grafana', version='5.3.2'
- )
- .addRequired(
- type='panel', id='graph', name='Graph', version='5.0.0'
- )
- .addRequired(
- type='panel', id='singlestat', name='Singlestat', version='5.0.0'
- )
- .addAnnotation(
- u.addAnnotationSchema(
- 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard'
- )
- )
- .addTemplate(
- g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
- )
- .addTemplate(
- u.addClusterTemplate()
- )
- .addTemplate(
- u.addJobTemplate()
- )
- .addTemplate(
- u.addTemplateSchema('ceph_hosts',
+ 0,
+ 4,
+ 5
+ ),
+ HostsOverviewSingleStatPanel(
+ 'bytes',
+ 'Network Load',
+ 'Total send/receive network load across all hosts in the ceph cluster',
+ 'current',
+ |||
+ sum (
+ (
+ rate(node_network_receive_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or
+ rate(node_network_receive_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval])
+ ) unless on (device, instance)
+ label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
+ ) +
+ sum (
+ (
+ rate(node_network_transmit_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or
+ rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval])
+ ) unless on (device, instance)
+ label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
+ )
+ |||
+ ,
+ true,
+ 20,
+ 0,
+ 4,
+ 5
+ ),
+ HostsOverviewGraphPanel(
+ 'CPU Busy - Top 10 Hosts',
+ 'Show the top 10 busiest hosts by cpu',
+ 'percent',
+ |||
+ topk(10,
+ 100 * (
+ 1 - (
+ avg by(instance) (
+ rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or
+ rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval])
+ )
+ )
+ )
+ )
+ |||,
+ '{{instance}}',
+ 0,
+ 5,
+ 12,
+ 9
+ ),
+ HostsOverviewGraphPanel(
+ 'Network Load - Top 10 Hosts', 'Top 10 hosts by network load', 'Bps', |||
+ topk(10, (sum by(instance) (
+ (
+ rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
+ rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
+ ) +
+ (
+ rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
+ rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
+ ) unless on (device, instance)
+ label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)"))
+ ))
+ |||
+ , '{{instance}}', 12, 5, 12, 9
+ ),
+ ]),
+ 'host-details.json':
+ local HostDetailsSingleStatPanel(format,
+ title,
+ description,
+ valueName,
+ expr,
+ x,
+ y,
+ w,
+ h) =
+ $.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
'$datasource',
- 'label_values({%(clusterMatcher)s}, instance)' % u.matchers(),
- 1,
+ format,
+ title,
+ description,
+ valueName,
+ false,
+ 100,
false,
- 3,
- 'Hostname',
- '([^.:]*).*')
+ false,
+ '')
+ .addTarget($.addTargetSchema(expr)) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+ local HostDetailsGraphPanel(alias,
+ title,
+ description,
+ nullPointMode,
+ formatY1,
+ labelY1,
+ expr,
+ legendFormat,
+ x,
+ y,
+ w,
+ h) =
+ $.graphPanelSchema(alias,
+ title,
+ description,
+ nullPointMode,
+ false,
+ formatY1,
+ 'short',
+ labelY1,
+ null,
+ null,
+ 1,
+ '$datasource')
+ .addTargets(
+ [$.addTargetSchema(expr, legendFormat)]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+ $.dashboardSchema(
+ 'Host Details',
+ '',
+ 'rtOg0AiWz',
+ 'now-1h',
+ '10s',
+ 16,
+ $._config.dashboardTags + ['overview'],
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='singlestat', name='Singlestat', version='5.0.0'
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard'
)
- .addPanels([
- u.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
- HostDetailsSingleStatPanel(
- 'none',
- 'OSDs',
- '',
- 'current',
- "count(sum by (ceph_daemon) (ceph_osd_metadata{%(matchers)s, hostname='$ceph_hosts'}))" % u.matchers(),
- 0,
- 1,
- 3,
- 5
- ),
- HostDetailsGraphPanel(
- {
- interrupt: '#447EBC',
- steal: '#6D1F62',
- system: '#890F02',
- user: '#3F6833',
- wait: '#C15C17',
- },
- 'CPU Utilization',
- "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown",
- 'null',
- 'percent',
- '% Utilization',
- |||
- sum by (mode) (
- rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) or
- rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval])
- ) / (
- scalar(
- sum(rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
- rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]))
- ) * 100
- )
- |||,
- '{{mode}}',
- 3,
- 1,
- 6,
- 10
- ),
- HostDetailsGraphPanel(
- {
- Available: '#508642',
- Free: '#508642',
- Total: '#bf1b00',
- Used: '#bf1b00',
- total: '#bf1b00',
- used: '#0a50a1',
- },
- 'RAM Usage',
- '',
- 'null',
- 'bytes',
- 'RAM used',
- |||
- node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or
- node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
- |||,
- 'Free',
- 9,
- 1,
- 6,
- 10
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('ceph_hosts',
+ '$datasource',
+ 'label_values({%(clusterMatcher)s}, instance)' % $.matchers(),
+ 1,
+ false,
+ 3,
+ 'Hostname',
+ '([^.:]*).*')
+ )
+ .addPanels([
+ $.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
+ HostDetailsSingleStatPanel(
+ 'none',
+ 'OSDs',
+ '',
+ 'current',
+ "count(sum by (ceph_daemon) (ceph_osd_metadata{%(matchers)s, hostname='$ceph_hosts'}))" % $.matchers(),
+ 0,
+ 1,
+ 3,
+ 5
+ ),
+ HostDetailsGraphPanel(
+ {
+ interrupt: '#447EBC',
+ steal: '#6D1F62',
+ system: '#890F02',
+ user: '#3F6833',
+ wait: '#C15C17',
+ },
+ 'CPU Utilization',
+ "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown",
+ 'null',
+ 'percent',
+ '% Utilization',
+ |||
+ sum by (mode) (
+ rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) or
+ rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval])
+ ) / (
+ scalar(
+ sum(rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]))
+ ) * 100
+ )
+ |||,
+ '{{mode}}',
+ 3,
+ 1,
+ 6,
+ 10
+ ),
+ HostDetailsGraphPanel(
+ {
+ Available: '#508642',
+ Free: '#508642',
+ Total: '#bf1b00',
+ Used: '#bf1b00',
+ total: '#bf1b00',
+ used: '#0a50a1',
+ },
+ 'RAM Usage',
+ '',
+ 'null',
+ 'bytes',
+ 'RAM used',
+ |||
+ node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ |||,
+ 'Free',
+ 9,
+ 1,
+ 6,
+ 10
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ |||,
+ 'total'
+ ),
+ $.addTargetSchema(
+ |||
+ (
+ node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ ) + (
+ node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ ) + (
+ node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ )
+ |||,
+ 'buffers/cache'
+ ),
+ $.addTargetSchema(
+ |||
+ (
node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or
- node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
- |||,
- 'total'
- ),
- u.addTargetSchema(
- |||
+ node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ ) - (
(
+ node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or
+ node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
+ ) + (
node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or
node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
) + (
node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or
node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
- ) + (
+ ) +
+ (
node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or
node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
)
- |||,
- 'buffers/cache'
- ),
- u.addTargetSchema(
- |||
- (
- node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or
- node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
- ) - (
- (
- node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or
- node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
- ) + (
- node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or
- node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
- ) + (
- node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or
- node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
- ) +
- (
- node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or
- node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
- )
- )
- |||,
- 'used'
- ),
- ]
- )
- .addSeriesOverride(
- {
- alias: 'total',
- color: '#bf1b00',
- fill: 0,
- linewidth: 2,
- stack: false,
- }
- ),
- HostDetailsGraphPanel(
- {},
- 'Network Load',
- "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')",
- 'null',
- 'decbytes',
- 'Send (-) / Receive (+)',
- |||
- sum by (device) (
- rate(
- node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or
- rate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]
)
+ |||,
+ 'used'
+ ),
+ ]
+ )
+ .addSeriesOverride(
+ {
+ alias: 'total',
+ color: '#bf1b00',
+ fill: 0,
+ linewidth: 2,
+ stack: false,
+ }
+ ),
+ HostDetailsGraphPanel(
+ {},
+ 'Network Load',
+ "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')",
+ 'null',
+ 'decbytes',
+ 'Send (-) / Receive (+)',
+ |||
+ sum by (device) (
+ rate(
+ node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or
+ rate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]
)
- |||,
- '{{device}}.rx',
- 15,
- 1,
- 6,
- 10
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
- sum by (device) (
- rate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or
- rate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval])
- )
- |||,
- '{{device}}.tx'
- ),
- ]
- )
- .addSeriesOverride(
- { alias: '/.*tx/', transform: 'negative-Y' }
- ),
- HostDetailsGraphPanel(
- {},
- 'Network drop rate',
- '',
- 'null',
- 'pps',
- 'Send (-) / Receive (+)',
- |||
- rate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
- rate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
- |||,
- '{{device}}.rx',
- 21,
- 1,
- 3,
- 5
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
- rate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
- rate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
- |||,
- '{{device}}.tx'
- ),
- ]
- )
- .addSeriesOverride(
- {
- alias: '/.*tx/',
- transform: 'negative-Y',
- }
- ),
- HostDetailsSingleStatPanel(
- 'bytes',
- 'Raw Capacity',
- 'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.',
- 'current',
- |||
- sum(
- ceph_osd_stat_bytes{%(matchers)s} and
- on (ceph_daemon) ceph_disk_occupation{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"}
- )
- ||| % u.matchers(),
- 0,
- 6,
- 3,
- 5
- ),
- HostDetailsGraphPanel(
- {},
- 'Network error rate',
- '',
- 'null',
- 'pps',
- 'Send (-) / Receive (+)',
- |||
- rate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
- rate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
- |||,
- '{{device}}.rx',
- 21,
- 6,
- 3,
- 5
- )
- .addTargets(
- [u.addTargetSchema(
+ )
+ |||,
+ '{{device}}.rx',
+ 15,
+ 1,
+ 6,
+ 10
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ sum by (device) (
+ rate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or
+ rate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval])
+ )
+ |||,
+ '{{device}}.tx'
+ ),
+ ]
+ )
+ .addSeriesOverride(
+ { alias: '/.*tx/', transform: 'negative-Y' }
+ ),
+ HostDetailsGraphPanel(
+ {},
+ 'Network drop rate',
+ '',
+ 'null',
+ 'pps',
+ 'Send (-) / Receive (+)',
+ |||
+ rate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
+ |||,
+ '{{device}}.rx',
+ 21,
+ 1,
+ 3,
+ 5
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
|||
- rate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
- rate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
+ rate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
|||,
'{{device}}.tx'
- )]
- )
- .addSeriesOverride(
- {
- alias: '/.*tx/',
- transform: 'negative-Y',
- }
- ),
- u.addRowSchema(false,
- true,
- 'OSD Disk Performance Statistics') + { gridPos: { x: 0, y: 11, w: 24, h: 1 } },
- HostDetailsGraphPanel(
- {},
- '$ceph_hosts Disk IOPS',
- "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it's name and corresponding OSD id value",
- 'connected',
- 'ops',
- 'Read (-) / Write (+)',
+ ),
+ ]
+ )
+ .addSeriesOverride(
+ {
+ alias: '/.*tx/',
+ transform: 'negative-Y',
+ }
+ ),
+ HostDetailsSingleStatPanel(
+ 'bytes',
+ 'Raw Capacity',
+ 'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.',
+ 'current',
+ |||
+ sum(
+ ceph_osd_stat_bytes{%(matchers)s} and
+ on (ceph_daemon) ceph_disk_occupation{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"}
+ )
+ ||| % $.matchers(),
+ 0,
+ 6,
+ 3,
+ 5
+ ),
+ HostDetailsGraphPanel(
+ {},
+ 'Network error rate',
+ '',
+ 'null',
+ 'pps',
+ 'Send (-) / Receive (+)',
+ |||
+ rate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
+ |||,
+ '{{device}}.rx',
+ 21,
+ 6,
+ 3,
+ 5
+ )
+ .addTargets(
+ [$.addTargetSchema(
|||
+ rate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
+ |||,
+ '{{device}}.tx'
+ )]
+ )
+ .addSeriesOverride(
+ {
+ alias: '/.*tx/',
+ transform: 'negative-Y',
+ }
+ ),
+ $.addRowSchema(false,
+ true,
+ 'OSD Disk Performance Statistics') + { gridPos: { x: 0, y: 11, w: 24, h: 1 } },
+ HostDetailsGraphPanel(
+ {},
+ '$ceph_hosts Disk IOPS',
+        "For any OSD devices on the host, this chart shows the IOPS per physical device. Each device is shown by its name and corresponding OSD id value",
+ 'connected',
+ 'ops',
+ 'Read (-) / Write (+)',
+ |||
+ label_replace(
+ (
+ rate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ ) * on(instance, device) group_left(ceph_daemon) label_replace(
label_replace(
- (
- rate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
- rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
- ), "instance", "$1", "instance", "([^:.]*).*"
- ) * on(instance, device) group_left(ceph_daemon) label_replace(
+ ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}}({{ceph_daemon}}) writes',
+ 0,
+ 12,
+ 11,
+ 9
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
label_replace(
- ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"
- ), "instance", "$1", "instance", "([^:.]*).*"
- )
- ||| % u.matchers(),
- '{{device}}({{ceph_daemon}}) writes',
- 0,
- 12,
- 11,
- 9
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
+ (
+ rate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ ) * on(instance, device) group_left(ceph_daemon) label_replace(
label_replace(
- (
- rate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
- rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
- ), "instance", "$1", "instance", "([^:.]*).*"
- ) * on(instance, device) group_left(ceph_daemon) label_replace(
- label_replace(
- ceph_disk_occupation_human{%(matchers)s},"device", "$1", "device", "/dev/(.*)"
- ), "instance", "$1", "instance", "([^:.]*).*"
- )
- ||| % u.matchers(),
- '{{device}}({{ceph_daemon}}) reads'
- ),
- ]
- )
- .addSeriesOverride(
- { alias: '/.*reads/', transform: 'negative-Y' }
- ),
- HostDetailsGraphPanel(
- {},
- '$ceph_hosts Throughput by Disk',
- 'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id',
- 'connected',
- 'Bps',
- 'Read (-) / Write (+)',
+ ceph_disk_occupation_human{%(matchers)s},"device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}}({{ceph_daemon}}) reads'
+ ),
+ ]
+ )
+ .addSeriesOverride(
+ { alias: '/.*reads/', transform: 'negative-Y' }
+ ),
+ HostDetailsGraphPanel(
+ {},
+ '$ceph_hosts Throughput by Disk',
+ 'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id',
+ 'connected',
+ 'Bps',
+ 'Read (-) / Write (+)',
+ |||
+ label_replace(
+ (
+ rate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
+ ), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device)
+ group_left(ceph_daemon) label_replace(
+ label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"),
+ "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}}({{ceph_daemon}}) write',
+ 12,
+ 12,
+ 11,
+ 9
+ )
+ .addTargets(
+ [$.addTargetSchema(
|||
label_replace(
(
- rate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
- rate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
- ), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device)
+ rate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
+ rate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
+ ),
+ "instance", "$1", "instance", "([^:.]*).*") * on(instance, device)
group_left(ceph_daemon) label_replace(
label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"),
"instance", "$1", "instance", "([^:.]*).*"
)
- ||| % u.matchers(),
- '{{device}}({{ceph_daemon}}) write',
- 12,
- 12,
- 11,
- 9
- )
- .addTargets(
- [u.addTargetSchema(
- |||
- label_replace(
- (
- rate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
- rate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
- ),
- "instance", "$1", "instance", "([^:.]*).*") * on(instance, device)
- group_left(ceph_daemon) label_replace(
- label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"),
- "instance", "$1", "instance", "([^:.]*).*"
- )
- ||| % u.matchers(),
- '{{device}}({{ceph_daemon}}) read'
- )]
- )
- .addSeriesOverride(
- { alias: '/.*read/', transform: 'negative-Y' }
- ),
- HostDetailsGraphPanel(
- {},
- '$ceph_hosts Disk Latency',
- "For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with it's corresponding OSD id",
- 'null as zero',
- 's',
- '',
- |||
- max by(instance, device) (label_replace(
- (rate(node_disk_write_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) /
- clamp_min(rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001) or
- (rate(node_disk_read_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) /
- clamp_min(rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001),
- "instance", "$1", "instance", "([^:.]*).*"
- )) * on(instance, device) group_left(ceph_daemon) label_replace(
- label_replace(
- ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"},
- "device", "$1", "device", "/dev/(.*)"
- ), "instance", "$1", "instance", "([^:.]*).*"
- )
- ||| % u.matchers(),
- '{{device}}({{ceph_daemon}})',
- 0,
- 21,
- 11,
- 9
- ),
- HostDetailsGraphPanel(
- {},
- '$ceph_hosts Disk utilization',
- 'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.',
- 'connected',
- 'percent',
- '%Util',
- |||
+ ||| % $.matchers(),
+ '{{device}}({{ceph_daemon}}) read'
+ )]
+ )
+ .addSeriesOverride(
+ { alias: '/.*read/', transform: 'negative-Y' }
+ ),
+ HostDetailsGraphPanel(
+ {},
+ '$ceph_hosts Disk Latency',
+        "For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with its corresponding OSD id",
+ 'null as zero',
+ 's',
+ '',
+ |||
+ max by(instance, device) (label_replace(
+ (rate(node_disk_write_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) /
+ clamp_min(rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001) or
+ (rate(node_disk_read_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) /
+ clamp_min(rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001),
+ "instance", "$1", "instance", "([^:.]*).*"
+ )) * on(instance, device) group_left(ceph_daemon) label_replace(
label_replace(
- (
- (rate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) / 10) or
- rate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) * 100
- ), "instance", "$1", "instance", "([^:.]*).*"
- ) * on(instance, device) group_left(ceph_daemon) label_replace(
- label_replace(ceph_disk_occupation_human{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"},
- "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"
- )
- ||| % u.matchers(),
- '{{device}}({{ceph_daemon}})',
- 12,
- 21,
- 11,
- 9
- ),
- ]),
- },
+ ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}}({{ceph_daemon}})',
+ 0,
+ 21,
+ 11,
+ 9
+ ),
+ HostDetailsGraphPanel(
+ {},
+ '$ceph_hosts Disk utilization',
+ 'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.',
+ 'connected',
+ 'percent',
+ '%Util',
+ |||
+ label_replace(
+ (
+ (rate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) / 10) or
+ rate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) * 100
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ ) * on(instance, device) group_left(ceph_daemon) label_replace(
+ label_replace(ceph_disk_occupation_human{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"},
+ "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}}({{ceph_daemon}})',
+ 12,
+ 21,
+ 11,
+ 9
+ ),
+ ]),
}
local g = import 'grafonnet/grafana.libsonnet';
-local u = import 'utils.libsonnet';
-local c = (import '../mixin.libsonnet')._config;
-{
- grafanaDashboards+:: {
- 'osds-overview.json':
- local OsdOverviewStyle(alias, pattern, type, unit) =
- u.addStyle(alias, null, [
- 'rgba(245, 54, 54, 0.9)',
- 'rgba(237, 129, 40, 0.89)',
- 'rgba(50, 172, 45, 0.97)',
- ], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []);
- local OsdOverviewGraphPanel(alias,
- title,
- description,
- formatY1,
- labelY1,
- min,
- expr,
- legendFormat1,
- x,
- y,
- w,
- h) =
- u.graphPanelSchema(alias,
- title,
- description,
- 'null',
- false,
- formatY1,
- 'short',
- labelY1,
- null,
- min,
- 1,
- '$datasource')
- .addTargets(
- [u.addTargetSchema(expr, legendFormat1)]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
- local OsdOverviewPieChartPanel(alias, description, title) =
- u.addPieChartSchema(alias,
- '$datasource',
- description,
- 'Under graph',
- 'pie',
- title,
- 'current');
- local OsdOverviewSingleStatPanel(colors,
- format,
- title,
- description,
- valueName,
- colorValue,
- gaugeMaxValue,
- gaugeShow,
- sparkLineShow,
- thresholds,
- expr,
- x,
- y,
- w,
- h) =
- u.addSingleStatSchema(
- colors,
- '$datasource',
- format,
- title,
- description,
- valueName,
- colorValue,
- gaugeMaxValue,
- gaugeShow,
- sparkLineShow,
- thresholds
- )
- .addTarget(
- u.addTargetSchema(expr)
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
-
- u.dashboardSchema(
- 'OSD Overview',
- '',
- 'lo02I1Aiz',
- 'now-1h',
- '10s',
- 16,
- c.dashboardTags,
- '',
- {
- refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
- )
- .addAnnotation(
- u.addAnnotationSchema(
- 1,
- '-- Grafana --',
- true,
- true,
- 'rgba(0, 211, 255, 1)',
- 'Annotations & Alerts',
- 'dashboard'
- )
- )
- .addRequired(
- type='grafana', id='grafana', name='Grafana', version='5.0.0'
+(import 'utils.libsonnet') {
+ 'osds-overview.json':
+ local OsdOverviewStyle(alias, pattern, type, unit) =
+ $.addStyle(alias, null, [
+ 'rgba(245, 54, 54, 0.9)',
+ 'rgba(237, 129, 40, 0.89)',
+ 'rgba(50, 172, 45, 0.97)',
+ ], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []);
+ local OsdOverviewGraphPanel(alias,
+ title,
+ description,
+ formatY1,
+ labelY1,
+ min,
+ expr,
+ legendFormat1,
+ x,
+ y,
+ w,
+ h) =
+ $.graphPanelSchema(alias,
+ title,
+ description,
+ 'null',
+ false,
+ formatY1,
+ 'short',
+ labelY1,
+ null,
+ min,
+ 1,
+ '$datasource')
+ .addTargets(
+ [$.addTargetSchema(expr, legendFormat1)]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
+ local OsdOverviewPieChartPanel(alias, description, title) =
+ $.addPieChartSchema(alias,
+ '$datasource',
+ description,
+ 'Under graph',
+ 'pie',
+ title,
+ 'current');
+ local OsdOverviewSingleStatPanel(colors,
+ format,
+ title,
+ description,
+ valueName,
+ colorValue,
+ gaugeMaxValue,
+ gaugeShow,
+ sparkLineShow,
+ thresholds,
+ expr,
+ x,
+ y,
+ w,
+ h) =
+ $.addSingleStatSchema(
+ colors,
+ '$datasource',
+ format,
+ title,
+ description,
+ valueName,
+ colorValue,
+ gaugeMaxValue,
+ gaugeShow,
+ sparkLineShow,
+ thresholds
)
- .addRequired(
- type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
- )
- .addRequired(
- type='panel', id='graph', name='Graph', version='5.0.0'
- )
- .addRequired(
- type='panel', id='table', name='Table', version='5.0.0'
- )
- .addTemplate(
- g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
- )
- .addTemplate(
- u.addClusterTemplate()
+ .addTarget(
+ $.addTargetSchema(expr)
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+ $.dashboardSchema(
+ 'OSD Overview',
+ '',
+ 'lo02I1Aiz',
+ 'now-1h',
+ '10s',
+ 16,
+ $._config.dashboardTags,
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
)
- .addTemplate(
- u.addJobTemplate()
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='table', name='Table', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addPanels([
+ OsdOverviewGraphPanel(
+ { '@95%ile': '#e0752d' },
+ 'OSD Read Latencies',
+ '',
+ 'ms',
+ null,
+ '0',
+ |||
+ avg (
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
+ )
+ ||| % $.matchers(),
+ 'AVG read',
+ 0,
+ 0,
+ 8,
+ 8
)
- .addPanels([
- OsdOverviewGraphPanel(
- { '@95%ile': '#e0752d' },
- 'OSD Read Latencies',
- '',
- 'ms',
- null,
- '0',
- |||
- avg (
- rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ max(
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
- )
- ||| % u.matchers(),
- 'AVG read',
- 0,
- 0,
- 8,
- 8
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
- max(
+ )
+ ||| % $.matchers(),
+ 'MAX read'
+ ),
+ $.addTargetSchema(
+ |||
+ quantile(0.95,
+ (
rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
- on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
+ * 1000
)
- ||| % u.matchers(),
- 'MAX read'
- ),
- u.addTargetSchema(
- |||
- quantile(0.95,
- (
- rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
- on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
- * 1000
- )
- )
- ||| % u.matchers(),
- '@95%ile'
- ),
- ],
- ),
- u.addTableSchema(
- '$datasource',
- "This table shows the osd's that are delivering the 10 highest read latencies within the cluster",
- { col: 2, desc: true },
- [
- OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
- OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),
- OsdOverviewStyle('', '/.*/', 'hidden', 'short'),
- ],
- 'Highest READ Latencies',
- 'table'
- )
- .addTarget(
- u.addTargetSchema(
- |||
- topk(10,
- (sort(
- (
- rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
- on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) *
- 1000
- )
- ))
)
- ||| % u.matchers(),
- '',
- 'table',
- 1,
- true
- )
- ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } },
- OsdOverviewGraphPanel(
- {
- '@95%ile write': '#e0752d',
- },
- 'OSD Write Latencies',
- '',
- 'ms',
- null,
- '0',
+ ||| % $.matchers(),
+ '@95%ile'
+ ),
+ ],
+ ),
+ $.addTableSchema(
+ '$datasource',
+        "This table shows the OSDs that are delivering the 10 highest read latencies within the cluster",
+ { col: 2, desc: true },
+ [
+ OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
+ OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),
+ OsdOverviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Highest READ Latencies',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
|||
- avg(
- rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
- on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
- * 1000
- )
- ||| % u.matchers(),
- 'AVG write',
- 12,
- 0,
- 8,
- 8
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
- max(
- rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
- on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
+ topk(10,
+ (sort(
+ (
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) *
1000
)
- ||| % u.matchers(), 'MAX write'
- ),
- u.addTargetSchema(
- |||
- quantile(0.95, (
- rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
- on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
- 1000
- ))
- ||| % u.matchers(), '@95%ile write'
- ),
- ],
- ),
- u.addTableSchema(
- '$datasource',
- "This table shows the osd's that are delivering the 10 highest write latencies within the cluster",
- { col: 2, desc: true },
- [
- OsdOverviewStyle(
- 'OSD ID', 'ceph_daemon', 'string', 'short'
- ),
- OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),
- OsdOverviewStyle('', '/.*/', 'hidden', 'short'),
- ],
- 'Highest WRITE Latencies',
- 'table'
+ ))
+ )
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
)
- .addTarget(
- u.addTargetSchema(
+ ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } },
+ OsdOverviewGraphPanel(
+ {
+ '@95%ile write': '#e0752d',
+ },
+ 'OSD Write Latencies',
+ '',
+ 'ms',
+ null,
+ '0',
+ |||
+ avg(
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
+ * 1000
+ )
+ ||| % $.matchers(),
+ 'AVG write',
+ 12,
+ 0,
+ 8,
+ 8
+ )
+ .addTargets(
+ [
+ $.addTargetSchema(
|||
- topk(10,
- (sort(
- (rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
- on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
- 1000)
- ))
+ max(
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
+ 1000
)
- ||| % u.matchers(),
- '',
- 'table',
- 1,
- true
- )
- ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } },
- OsdOverviewPieChartPanel(
- {}, '', 'OSD Types Summary'
- )
- .addTarget(
- u.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % u.matchers(), '{{device_class}}')
- ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } },
- OsdOverviewPieChartPanel(
- { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types'
- )
- .addTarget(
- u.addTargetSchema(
- 'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % u.matchers(), 'bluestore', 'time_series', 2
- )
- )
- .addTarget(
- u.addTargetSchema(
- 'absent(ceph_bluefs_wal_total_bytes%(matchers)s) * count(ceph_osd_metadata{%(matchers)s})' % u.matchers(), 'filestore', 'time_series', 2
- )
- ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } },
- OsdOverviewPieChartPanel(
- {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary'
- )
- .addTarget(u.addTargetSchema(
- 'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % u.matchers(), '<1TB', 'time_series', 2
- ))
- .addTarget(u.addTargetSchema(
- 'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % u.matchers(), '<2TB', 'time_series', 2
- ))
- .addTarget(u.addTargetSchema(
- 'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % u.matchers(), '<3TB', 'time_series', 2
- ))
- .addTarget(u.addTargetSchema(
- 'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % u.matchers(), '<4TB', 'time_series', 2
- ))
- .addTarget(u.addTargetSchema(
- 'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % u.matchers(), '<6TB', 'time_series', 2
- ))
- .addTarget(u.addTargetSchema(
- 'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % u.matchers(), '<8TB', 'time_series', 2
- ))
- .addTarget(u.addTargetSchema(
- 'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % u.matchers(), '<10TB', 'time_series', 2
- ))
- .addTarget(u.addTargetSchema(
- 'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % u.matchers(), '<12TB', 'time_series', 2
- ))
- .addTarget(u.addTargetSchema(
- 'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % u.matchers(), '<12TB+', 'time_series', 2
- )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } },
- g.graphPanel.new(bars=true,
- datasource='$datasource',
- title='Distribution of PGs per OSD',
- x_axis_buckets=20,
- x_axis_mode='histogram',
- x_axis_values=['total'],
- formatY1='short',
- formatY2='short',
- labelY1='# of OSDs',
- min='0',
- nullPointMode='null')
- .addTarget(u.addTargetSchema(
- 'ceph_osd_numpg{%(matchers)s}' % u.matchers(), 'PGs per OSD', 'time_series', 1, true
- )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
- OsdOverviewSingleStatPanel(
- ['#d44a3a', '#299c46'],
- 'percentunit',
- 'OSD onode Hits Ratio',
- 'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster',
- 'current',
- true,
- 1,
- true,
- false,
- '.75',
+ ||| % $.matchers(), 'MAX write'
+ ),
+ $.addTargetSchema(
+ |||
+ quantile(0.95, (
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
+ 1000
+ ))
+ ||| % $.matchers(), '@95%ile write'
+ ),
+ ],
+ ),
+ $.addTableSchema(
+ '$datasource',
+ "This table shows the osd's that are delivering the 10 highest write latencies within the cluster",
+ { col: 2, desc: true },
+ [
+ OsdOverviewStyle(
+ 'OSD ID', 'ceph_daemon', 'string', 'short'
+ ),
+ OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),
+ OsdOverviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Highest WRITE Latencies',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
|||
- sum(ceph_bluestore_onode_hits{%(matchers)s}) / (
- sum(ceph_bluestore_onode_hits{%(matchers)s}) +
- sum(ceph_bluestore_onode_misses{%(matchers)s})
+ topk(10,
+ (sort(
+ (rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
+ 1000)
+ ))
)
- ||| % u.matchers(),
- 20,
- 8,
- 4,
- 8
- ),
- u.addRowSchema(false,
- true,
- 'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
- OsdOverviewGraphPanel(
- {},
- 'Read/Write Profile',
- 'Show the read/write workload profile overtime',
- 'short',
- null,
- null,
- 'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % u.matchers(),
- 'Reads',
- 0,
- 17,
- 24,
- 8
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
)
- .addTargets([u.addTargetSchema(
- 'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % u.matchers(), 'Writes'
- )]),
- ]),
- 'osd-device-details.json':
- local OsdDeviceDetailsPanel(title,
- description,
- formatY1,
- labelY1,
- expr1,
- expr2,
- legendFormat1,
- legendFormat2,
- x,
- y,
- w,
- h) =
- u.graphPanelSchema({},
- title,
- description,
- 'null',
- false,
- formatY1,
- 'short',
- labelY1,
- null,
- null,
- 1,
- '$datasource')
- .addTargets(
- [
- u.addTargetSchema(expr1,
- legendFormat1),
- u.addTargetSchema(expr2, legendFormat2),
- ]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
-
- u.dashboardSchema(
- 'OSD device details',
- '',
- 'CrAHE0iZz',
- 'now-3h',
- '',
- 16,
- c.dashboardTags,
- '',
- {
- refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
+ ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } },
+ OsdOverviewPieChartPanel(
+ {}, '', 'OSD Types Summary'
)
- .addAnnotation(
- u.addAnnotationSchema(
- 1,
- '-- Grafana --',
- true,
- true,
- 'rgba(0, 211, 255, 1)',
- 'Annotations & Alerts',
- 'dashboard'
+ .addTarget(
+ $.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % $.matchers(), '{{device_class}}')
+ ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } },
+ OsdOverviewPieChartPanel(
+ { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ 'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % $.matchers(), 'bluestore', 'time_series', 2
)
)
- .addRequired(
- type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ .addTarget(
+ $.addTargetSchema(
+ 'absent(ceph_bluefs_wal_total_bytes%(matchers)s) * count(ceph_osd_metadata{%(matchers)s})' % $.matchers(), 'filestore', 'time_series', 2
+ )
+ ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } },
+ OsdOverviewPieChartPanel(
+ {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary'
)
- .addRequired(
- type='panel', id='graph', name='Graph', version='5.0.0'
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % $.matchers(), '<1TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % $.matchers(), '<2TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % $.matchers(), '<3TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % $.matchers(), '<4TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % $.matchers(), '<6TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % $.matchers(), '<8TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % $.matchers(), '<10TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % $.matchers(), '<12TB', 'time_series', 2
+ ))
+ .addTarget($.addTargetSchema(
+ 'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % $.matchers(), '12TB+', 'time_series', 2
+ )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } },
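+ // Histogram of ceph_osd_numpg, bucketed into 20 bins, showing how PGs are distributed across OSDs.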
+ g.graphPanel.new(bars=true,
+ datasource='$datasource',
+ title='Distribution of PGs per OSD',
+ x_axis_buckets=20,
+ x_axis_mode='histogram',
+ x_axis_values=['total'],
+ formatY1='short',
+ formatY2='short',
+ labelY1='# of OSDs',
+ min='0',
+ nullPointMode='null')
+ .addTarget($.addTargetSchema(
+ 'ceph_osd_numpg{%(matchers)s}' % $.matchers(), 'PGs per OSD', 'time_series', 1, true
+ )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
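+ // Onode cache hit ratio: hits / (hits + misses) summed across all OSDs, with a 0.75 threshold.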
+ OsdOverviewSingleStatPanel(
+ ['#d44a3a', '#299c46'],
+ 'percentunit',
+ 'OSD onode Hits Ratio',
+ 'This gauge panel shows the onode cache hit ratio, to help determine whether increasing RAM per OSD could improve cluster performance',
+ 'current',
+ true,
+ 1,
+ true,
+ false,
+ '.75',
+ |||
+ sum(ceph_bluestore_onode_hits{%(matchers)s}) / (
+ sum(ceph_bluestore_onode_hits{%(matchers)s}) +
+ sum(ceph_bluestore_onode_misses{%(matchers)s})
+ )
+ ||| % $.matchers(),
+ 20,
+ 8,
+ 4,
+ 8
+ ),
+ $.addRowSchema(false,
+ true,
+ 'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
+ OsdOverviewGraphPanel(
+ {},
+ 'Read/Write Profile',
+ 'Shows the read/write workload profile over time',
+ 'short',
+ null,
+ null,
+ 'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % $.matchers(),
+ 'Reads',
+ 0,
+ 17,
+ 24,
+ 8
)
- .addTemplate(
- g.template.datasource('datasource',
- 'prometheus',
- 'default',
- label='Data Source')
+ .addTargets([$.addTargetSchema(
+ 'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
+ )]),
+ ]),
+ 'osd-device-details.json':
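+ // Helper that builds a two-target graph panel (e.g. read vs. write) at the given grid position.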
+ local OsdDeviceDetailsPanel(title,
+ description,
+ formatY1,
+ labelY1,
+ expr1,
+ expr2,
+ legendFormat1,
+ legendFormat2,
+ x,
+ y,
+ w,
+ h) =
+ $.graphPanelSchema({},
+ title,
+ description,
+ 'null',
+ false,
+ formatY1,
+ 'short',
+ labelY1,
+ null,
+ null,
+ 1,
+ '$datasource')
+ .addTargets(
+ [
+ $.addTargetSchema(expr1,
+ legendFormat1),
+ $.addTargetSchema(expr2, legendFormat2),
+ ]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+ $.dashboardSchema(
+ 'OSD device details',
+ '',
+ 'CrAHE0iZz',
+ 'now-3h',
+ '',
+ 16,
+ $._config.dashboardTags,
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
)
- .addTemplate(
- u.addClusterTemplate()
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource',
+ 'prometheus',
+ 'default',
+ label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('osd',
+ '$datasource',
+ 'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ false,
+ 1,
+ 'OSD',
+ '(.*)')
+ )
+ .addPanels([
+ $.addRowSchema(
+ false, true, 'OSD Performance'
+ ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
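+ // Per-OSD read/write latency in seconds; reads are plotted on the negative Y axis.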
+ OsdDeviceDetailsPanel(
+ '$osd Latency',
+ '',
+ 's',
+ 'Read (-) / Write (+)',
+ |||
+ rate(ceph_osd_op_r_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
+ ||| % $.matchers(),
+ |||
+ rate(ceph_osd_op_w_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
+ on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
+ ||| % $.matchers(),
+ 'read',
+ 'write',
+ 0,
+ 1,
+ 6,
+ 9
)
- .addTemplate(
- u.addJobTemplate()
+ .addSeriesOverride(
+ {
+ alias: 'read',
+ transform: 'negative-Y',
+ }
+ ),
+ OsdDeviceDetailsPanel(
+ '$osd R/W IOPS',
+ '',
+ 'short',
+ 'Read (-) / Write (+)',
+ 'rate(ceph_osd_op_r{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'rate(ceph_osd_op_w{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'Reads',
+ 'Writes',
+ 6,
+ 1,
+ 6,
+ 9
)
- .addTemplate(
- u.addTemplateSchema('osd',
- '$datasource',
- 'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(),
- 1,
- false,
- 1,
- 'OSD',
- '(.*)')
+ .addSeriesOverride(
+ { alias: 'Reads', transform: 'negative-Y' }
+ ),
+ OsdDeviceDetailsPanel(
+ '$osd R/W Bytes',
+ '',
+ 'bytes',
+ 'Read (-) / Write (+)',
+ 'rate(ceph_osd_op_r_out_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'rate(ceph_osd_op_w_in_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
+ 'Read Bytes',
+ 'Write Bytes',
+ 12,
+ 1,
+ 6,
+ 9
)
- .addPanels([
- u.addRowSchema(
- false, true, 'OSD Performance'
- ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
- OsdDeviceDetailsPanel(
- '$osd Latency',
- '',
- 's',
- 'Read (-) / Write (+)',
- |||
- rate(ceph_osd_op_r_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
- on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
- ||| % u.matchers(),
- |||
- rate(ceph_osd_op_w_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
- on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
- ||| % u.matchers(),
- 'read',
- 'write',
- 0,
- 1,
- 6,
- 9
- )
- .addSeriesOverride(
- {
- alias: 'read',
- transform: 'negative-Y',
- }
- ),
- OsdDeviceDetailsPanel(
- '$osd R/W IOPS',
- '',
- 'short',
- 'Read (-) / Write (+)',
- 'rate(ceph_osd_op_r{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(),
- 'rate(ceph_osd_op_w{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(),
- 'Reads',
- 'Writes',
- 6,
- 1,
- 6,
- 9
- )
- .addSeriesOverride(
- { alias: 'Reads', transform: 'negative-Y' }
- ),
- OsdDeviceDetailsPanel(
- '$osd R/W Bytes',
- '',
- 'bytes',
- 'Read (-) / Write (+)',
- 'rate(ceph_osd_op_r_out_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(),
- 'rate(ceph_osd_op_w_in_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(),
- 'Read Bytes',
- 'Write Bytes',
- 12,
- 1,
- 6,
- 9
- )
- .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }),
- u.addRowSchema(
- false, true, 'Physical Device Performance'
- ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } },
- OsdDeviceDetailsPanel(
- 'Physical Device Latency for $osd',
- '',
- 's',
- 'Read (-) / Write (+)',
- |||
- (
- label_replace(
- rate(node_disk_read_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
- rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
- "instance", "$1", "instance", "([^:.]*).*"
- ) and on (instance, device) label_replace(
- label_replace(
- ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
- "device", "$1", "device", "/dev/(.*)"
- ), "instance", "$1", "instance", "([^:.]*).*"
- )
- )
- ||| % u.matchers(),
- |||
- (
- label_replace(
- rate(node_disk_write_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
- rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
- "instance", "$1", "instance", "([^:.]*).*") and on (instance, device)
- label_replace(
- label_replace(
- ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
- ), "instance", "$1", "instance", "([^:.]*).*"
- )
- )
- ||| % u.matchers(),
- '{{instance}}/{{device}} Reads',
- '{{instance}}/{{device}} Writes',
- 0,
- 11,
- 6,
- 9
- )
- .addSeriesOverride(
- { alias: '/.*Reads/', transform: 'negative-Y' }
- ),
- OsdDeviceDetailsPanel(
- 'Physical Device R/W IOPS for $osd',
- '',
- 'short',
- 'Read (-) / Write (+)',
- |||
+ .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }),
+ $.addRowSchema(
+ false, true, 'Physical Device Performance'
+ ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } },
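+ // Device-level stats from node_exporter, joined to the selected OSD via ceph_disk_occupation_human.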
+ OsdDeviceDetailsPanel(
+ 'Physical Device Latency for $osd',
+ '',
+ 's',
+ 'Read (-) / Write (+)',
+ |||
+ (
label_replace(
- rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ rate(node_disk_read_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
+ rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
"instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
"device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
- ||| % u.matchers(),
- |||
+ )
+ ||| % $.matchers(),
+ |||
+ (
label_replace(
- rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
- "instance", "$1", "instance", "([^:.]*).*"
- ) and on (instance, device) label_replace(
+ rate(node_disk_write_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
+ rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*") and on (instance, device)
label_replace(
- ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
- "device", "$1", "device", "/dev/(.*)"
- ), "instance", "$1", "instance", "([^:.]*).*"
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
)
- ||| % u.matchers(),
- '{{device}} on {{instance}} Writes',
- '{{device}} on {{instance}} Reads',
- 6,
- 11,
- 6,
- 9
- )
- .addSeriesOverride(
- { alias: '/.*Reads/', transform: 'negative-Y' }
- ),
- OsdDeviceDetailsPanel(
- 'Physical Device R/W Bytes for $osd',
- '',
- 'Bps',
- 'Read (-) / Write (+)',
- |||
+ ||| % $.matchers(),
+ '{{instance}}/{{device}} Reads',
+ '{{instance}}/{{device}} Writes',
+ 0,
+ 11,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ { alias: '/.*Reads/', transform: 'negative-Y' }
+ ),
+ OsdDeviceDetailsPanel(
+ 'Physical Device R/W IOPS for $osd',
+ '',
+ 'short',
+ 'Read (-) / Write (+)',
+ |||
+ label_replace(
+ rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
label_replace(
- rate(node_disk_read_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
- ) and on (instance, device) label_replace(
- label_replace(
- ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
- "device", "$1", "device", "/dev/(.*)"
- ), "instance", "$1", "instance", "([^:.]*).*"
- )
- ||| % u.matchers(),
- |||
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ |||
+ label_replace(
+ rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
label_replace(
- rate(node_disk_written_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
- ) and on (instance, device) label_replace(
- label_replace(
- ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
- "device", "$1", "device", "/dev/(.*)"
- ), "instance", "$1", "instance", "([^:.]*).*"
- )
- ||| % u.matchers(),
- '{{instance}} {{device}} Reads',
- '{{instance}} {{device}} Writes',
- 12,
- 11,
- 6,
- 9
- )
- .addSeriesOverride(
- { alias: '/.*Reads/', transform: 'negative-Y' }
- ),
- u.graphPanelSchema(
- {},
- 'Physical Device Util% for $osd',
- '',
- 'null',
- false,
- 'percentunit',
- 'short',
- null,
- null,
- null,
- 1,
- '$datasource'
- )
- .addTarget(u.addTargetSchema(
- |||
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}} on {{instance}} Writes',
+ '{{device}} on {{instance}} Reads',
+ 6,
+ 11,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ { alias: '/.*Reads/', transform: 'negative-Y' }
+ ),
+ OsdDeviceDetailsPanel(
+ 'Physical Device R/W Bytes for $osd',
+ '',
+ 'Bps',
+ 'Read (-) / Write (+)',
+ |||
+ label_replace(
+ rate(node_disk_read_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
label_replace(
- rate(node_disk_io_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]),
- "instance", "$1", "instance", "([^:.]*).*"
- ) and on (instance, device) label_replace(
- label_replace(
- ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
- ), "instance", "$1", "instance", "([^:.]*).*"
- )
- ||| % u.matchers(),
- '{{device}} on {{instance}}'
- )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } },
- ]),
- },
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ |||
+ label_replace(
+ rate(node_disk_written_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
+ "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{instance}} {{device}} Reads',
+ '{{instance}} {{device}} Writes',
+ 12,
+ 11,
+ 6,
+ 9
+ )
+ .addSeriesOverride(
+ { alias: '/.*Reads/', transform: 'negative-Y' }
+ ),
+ $.graphPanelSchema(
+ {},
+ 'Physical Device Util% for $osd',
+ '',
+ 'null',
+ false,
+ 'percentunit',
+ 'short',
+ null,
+ null,
+ null,
+ 1,
+ '$datasource'
+ )
+ .addTarget($.addTargetSchema(
+ |||
+ label_replace(
+ rate(node_disk_io_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]),
+ "instance", "$1", "instance", "([^:.]*).*"
+ ) and on (instance, device) label_replace(
+ label_replace(
+ ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
+ ), "instance", "$1", "instance", "([^:.]*).*"
+ )
+ ||| % $.matchers(),
+ '{{device}} on {{instance}}'
+ )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } },
+ ]),
}
local g = import 'grafonnet/grafana.libsonnet';
-local u = import 'utils.libsonnet';
-local c = (import '../mixin.libsonnet')._config;
-{
- grafanaDashboards+:: {
- 'pool-overview.json':
- local PoolOverviewSingleStatPanel(format,
- title,
- description,
- valueName,
- expr,
- instant,
- targetFormat,
- x,
- y,
- w,
- h) =
- u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
- '$datasource',
- format,
- title,
- description,
- valueName,
- false,
- 100,
- false,
- false,
- '')
- .addTarget(u.addTargetSchema(expr, '', targetFormat, 1, instant)) + { gridPos: { x: x, y: y, w: w, h: h } };
-
- local PoolOverviewStyle(alias,
- pattern,
- type,
- unit,
- colorMode,
- thresholds,
- valueMaps) =
- u.addStyle(alias,
- colorMode,
- [
- 'rgba(245, 54, 54, 0.9)',
- 'rgba(237, 129, 40, 0.89)',
- 'rgba(50, 172, 45, 0.97)',
- ],
- 'YYYY-MM-DD HH:mm:ss',
- 2,
- 1,
- pattern,
- thresholds,
- type,
- unit,
- valueMaps);
-
- local PoolOverviewGraphPanel(title,
- description,
- formatY1,
- labelY1,
- expr,
- legendFormat,
- x,
- y,
- w,
- h) =
- u.graphPanelSchema({},
- title,
- description,
- 'null as zero',
- false,
- formatY1,
- 'short',
- labelY1,
- null,
- 0,
- 1,
- '$datasource')
- .addTargets(
- [u.addTargetSchema(expr,
- legendFormat)]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
-
- u.dashboardSchema(
- 'Ceph Pools Overview',
- '',
- 'z99hzWtmk',
- 'now-1h',
- '15s',
- 22,
- c.dashboardTags,
- '',
- {
- refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
- )
- .addAnnotation(
- u.addAnnotationSchema(
- 1,
- '-- Grafana --',
- true,
- true,
- 'rgba(0, 211, 255, 1)',
- 'Annotations & Alerts',
- 'dashboard'
- )
- )
- .addTemplate(
- g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
- )
- .addTemplate(
- u.addClusterTemplate()
- )
- .addTemplate(
- u.addJobTemplate()
- )
- .addTemplate(
- g.template.custom(label='TopK',
- name='topk',
- current='15',
- query='15')
- )
- .addPanels([
- PoolOverviewSingleStatPanel(
- 'none',
- 'Pools',
- '',
- 'avg',
- 'count(ceph_pool_metadata{%(matchers)s})' % u.matchers(),
- true,
- 'table',
- 0,
- 0,
- 3,
- 3
- ),
- PoolOverviewSingleStatPanel(
- 'none',
- 'Pools with Compression',
- 'Count of the pools that have compression enabled',
- 'current',
- 'count(ceph_pool_metadata{%(matchers)s, compression_mode!="none"})' % u.matchers(),
- null,
- '',
- 3,
- 0,
- 3,
- 3
- ),
- PoolOverviewSingleStatPanel(
- 'bytes',
- 'Total Raw Capacity',
- 'Total raw capacity available to the cluster',
- 'current',
- 'sum(ceph_osd_stat_bytes{%(matchers)s})' % u.matchers(),
- null,
- '',
- 6,
- 0,
- 3,
- 3
- ),
- PoolOverviewSingleStatPanel(
- 'bytes',
- 'Raw Capacity Consumed',
- 'Total raw capacity consumed by user data and associated overheads (metadata + redundancy)',
- 'current',
- 'sum(ceph_pool_bytes_used{%(matchers)s})' % u.matchers(),
- true,
- '',
- 9,
- 0,
- 3,
- 3
- ),
- PoolOverviewSingleStatPanel(
- 'bytes',
- 'Logical Stored ',
- 'Total of client data stored in the cluster',
- 'current',
- 'sum(ceph_pool_stored{%(matchers)s})' % u.matchers(),
- true,
- '',
- 12,
- 0,
- 3,
- 3
- ),
- PoolOverviewSingleStatPanel(
- 'bytes',
- 'Compression Savings',
- 'A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression',
- 'current',
- |||
- sum(
- ceph_pool_compress_under_bytes{%(matchers)s} -
- ceph_pool_compress_bytes_used{%(matchers)s}
- )
- ||| % u.matchers(),
- null,
- '',
- 15,
- 0,
- 3,
- 3
- ),
- PoolOverviewSingleStatPanel(
- 'percent',
- 'Compression Eligibility',
- 'Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data',
- 'current',
- |||
- (
- sum(ceph_pool_compress_under_bytes{%(matchers)s} > 0) /
- sum(ceph_pool_stored_raw{%(matchers)s} and ceph_pool_compress_under_bytes{%(matchers)s} > 0)
- ) * 100
- ||| % u.matchers(),
- null,
- 'table',
- 18,
- 0,
- 3,
- 3
- ),
- PoolOverviewSingleStatPanel(
- 'none',
- 'Compression Factor',
- 'This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. It does not account for data written that was ineligible for compression (too small, or compression yield too low)',
- 'current',
- |||
- sum(
- ceph_pool_compress_under_bytes{%(matchers)s} > 0)
- / sum(ceph_pool_compress_bytes_used{%(matchers)s} > 0
- )
- ||| % u.matchers(),
- null,
- '',
- 21,
- 0,
- 3,
- 3
- ),
- u.addTableSchema(
- '$datasource',
- '',
- { col: 5, desc: true },
- [
- PoolOverviewStyle('', 'Time', 'hidden', 'short', null, [], []),
- PoolOverviewStyle('', 'instance', 'hidden', 'short', null, [], []),
- PoolOverviewStyle('', 'job', 'hidden', 'short', null, [], []),
- PoolOverviewStyle('Pool Name', 'name', 'string', 'short', null, [], []),
- PoolOverviewStyle('Pool ID', 'pool_id', 'hidden', 'none', null, [], []),
- PoolOverviewStyle('Compression Factor', 'Value #A', 'number', 'none', null, [], []),
- PoolOverviewStyle('% Used', 'Value #D', 'number', 'percentunit', 'value', ['70', '85'], []),
- PoolOverviewStyle('Usable Free', 'Value #B', 'number', 'bytes', null, [], []),
- PoolOverviewStyle('Compression Eligibility', 'Value #C', 'number', 'percent', null, [], []),
- PoolOverviewStyle('Compression Savings', 'Value #E', 'number', 'bytes', null, [], []),
- PoolOverviewStyle('Growth (5d)', 'Value #F', 'number', 'bytes', 'value', ['0', '0'], []),
- PoolOverviewStyle('IOPS', 'Value #G', 'number', 'none', null, [], []),
- PoolOverviewStyle('Bandwidth', 'Value #H', 'number', 'Bps', null, [], []),
- PoolOverviewStyle('', '__name__', 'hidden', 'short', null, [], []),
- PoolOverviewStyle('', 'type', 'hidden', 'short', null, [], []),
- PoolOverviewStyle('', 'compression_mode', 'hidden', 'short', null, [], []),
- PoolOverviewStyle('Type', 'description', 'string', 'short', null, [], []),
- PoolOverviewStyle('Stored', 'Value #J', 'number', 'bytes', null, [], []),
- PoolOverviewStyle('', 'Value #I', 'hidden', 'short', null, [], []),
- PoolOverviewStyle('Compression', 'Value #K', 'string', 'short', null, [], [{ text: 'ON', value: '1' }]),
- ],
- 'Pool Overview',
- 'table'
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
- (
- ceph_pool_compress_under_bytes{%(matchers)s} /
- ceph_pool_compress_bytes_used{%(matchers)s} > 0
- ) and on(pool_id) (
- (
- (ceph_pool_compress_under_bytes{%(matchers)s} > 0) /
- ceph_pool_stored_raw{%(matchers)s}
- ) * 100 > 0.5
- )
- ||| % u.matchers(),
- 'A',
- 'table',
- 1,
- true
- ),
- u.addTargetSchema(
- |||
- ceph_pool_max_avail{%(matchers)s} *
- on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s}
- ||| % u.matchers(),
- 'B',
- 'table',
- 1,
- true
- ),
- u.addTargetSchema(
- |||
- (
- (ceph_pool_compress_under_bytes{%(matchers)s} > 0) /
- ceph_pool_stored_raw{%(matchers)s}
- ) * 100
- ||| % u.matchers(),
- 'C',
- 'table',
- 1,
- true
- ),
- u.addTargetSchema(
- |||
- ceph_pool_percent_used{%(matchers)s} *
- on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s}
- ||| % u.matchers(),
- 'D',
- 'table',
- 1,
- true
- ),
- u.addTargetSchema(
- |||
- ceph_pool_compress_under_bytes{%(matchers)s} -
- ceph_pool_compress_bytes_used{%(matchers)s} > 0
- ||| % u.matchers(),
- 'E',
- 'table',
- 1,
- true
- ),
- u.addTargetSchema(
- 'delta(ceph_pool_stored{%(matchers)s}[5d])' % u.matchers(), 'F', 'table', 1, true
- ),
- u.addTargetSchema(
- |||
- rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])
- + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])
- ||| % u.matchers(),
- 'G',
- 'table',
- 1,
- true
- ),
- u.addTargetSchema(
- |||
- rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) +
- rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval])
- ||| % u.matchers(),
- 'H',
- 'table',
- 1,
- true
- ),
- u.addTargetSchema(
- 'ceph_pool_metadata{%(matchers)s}' % u.matchers(), 'I', 'table', 1, true
- ),
- u.addTargetSchema(
- 'ceph_pool_stored{%(matchers)s} * on(pool_id) group_left ceph_pool_metadata{%(matchers)s}' % u.matchers(),
- 'J',
- 'table',
- 1,
- true
- ),
- u.addTargetSchema(
- 'ceph_pool_metadata{%(matchers)s, compression_mode!="none"}' % u.matchers(), 'K', 'table', 1, true
- ),
- u.addTargetSchema('', 'L', '', '', null),
- ]
- ) + { gridPos: { x: 0, y: 3, w: 24, h: 6 } },
- PoolOverviewGraphPanel(
- 'Top $topk Client IOPS by Pool',
- 'This chart shows the sum of read and write IOPS from all clients by pool',
- 'short',
- 'IOPS',
- |||
- topk($topk,
- round(
- (
- rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) +
- rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])
- ), 1
- ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s})
- ||| % u.matchers(),
- '{{name}} ',
- 0,
- 9,
- 12,
- 8
- )
- .addTarget(
- u.addTargetSchema(
- |||
- topk($topk,
- rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) +
- on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s}
- )
- ||| % u.matchers(),
- '{{name}} - write'
- )
- ),
- PoolOverviewGraphPanel(
- 'Top $topk Client Bandwidth by Pool',
- 'The chart shows the sum of read and write bytes from all clients, by pool',
- 'Bps',
- 'Throughput',
- |||
- topk($topk,
- (
- rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) +
- rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval])
- ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s}
- )
- ||| % u.matchers(),
- '{{name}}',
- 12,
- 9,
- 12,
- 8
- ),
- PoolOverviewGraphPanel(
- 'Pool Capacity Usage (RAW)',
- 'Historical view of capacity usage, to help identify growth and trends in pool consumption',
- 'bytes',
- 'Capacity Used',
- 'ceph_pool_bytes_used{%(matchers)s} * on(pool_id) group_right ceph_pool_metadata{%(matchers)s}' % u.matchers(),
- '{{name}}',
- 0,
- 17,
- 24,
- 7
- ),
- ]),
- 'pool-detail.json':
- local PoolDetailSingleStatPanel(format,
+(import 'utils.libsonnet') {
+ 'pool-overview.json':
+ local PoolOverviewSingleStatPanel(format,
title,
description,
valueName,
- colorValue,
- gaugeMaxValue,
- gaugeShow,
- sparkLineShow,
- thresholds,
expr,
+ instant,
targetFormat,
x,
y,
w,
h) =
- u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
- '$datasource',
- format,
- title,
- description,
- valueName,
- colorValue,
- gaugeMaxValue,
- gaugeShow,
- sparkLineShow,
- thresholds)
- .addTarget(u.addTargetSchema(expr, '', targetFormat)) + { gridPos: { x: x, y: y, w: w, h: h } };
+ $.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
+ '$datasource',
+ format,
+ title,
+ description,
+ valueName,
+ false,
+ 100,
+ false,
+ false,
+ '')
+ .addTarget($.addTargetSchema(expr, '', targetFormat, 1, instant)) + { gridPos: { x: x, y: y, w: w, h: h } };
- local PoolDetailGraphPanel(alias,
- title,
+ local PoolOverviewStyle(alias,
+ pattern,
+ type,
+ unit,
+ colorMode,
+ thresholds,
+ valueMaps) =
+ $.addStyle(alias,
+ colorMode,
+ [
+ 'rgba(245, 54, 54, 0.9)',
+ 'rgba(237, 129, 40, 0.89)',
+ 'rgba(50, 172, 45, 0.97)',
+ ],
+ 'YYYY-MM-DD HH:mm:ss',
+ 2,
+ 1,
+ pattern,
+ thresholds,
+ type,
+ unit,
+ valueMaps);
+
+ local PoolOverviewGraphPanel(title,
description,
formatY1,
labelY1,
y,
w,
h) =
- u.graphPanelSchema(alias,
- title,
- description,
- 'null as zero',
- false,
- formatY1,
- 'short',
- labelY1,
- null,
- null,
- 1,
- '$datasource')
- .addTargets(
- [u.addTargetSchema(expr, legendFormat)]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
+ $.graphPanelSchema({},
+ title,
+ description,
+ 'null as zero',
+ false,
+ formatY1,
+ 'short',
+ labelY1,
+ null,
+ 0,
+ 1,
+ '$datasource')
+ .addTargets(
+ [$.addTargetSchema(expr,
+ legendFormat)]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
- u.dashboardSchema(
- 'Ceph Pool Details',
+ $.dashboardSchema(
+ 'Ceph Pools Overview',
+ '',
+ 'z99hzWtmk',
+ 'now-1h',
+ '15s',
+ 22,
+ $._config.dashboardTags,
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ g.template.custom(label='TopK',
+ name='topk',
+ current='15',
+ query='15')
+ )
+ .addPanels([
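+ // Top row: capacity and compression summary single-stat panels.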
+ PoolOverviewSingleStatPanel(
+ 'none',
+ 'Pools',
'',
- '-xyV8KCiz',
- 'now-1h',
- '15s',
- 22,
- c.dashboardTags,
+ 'avg',
+ 'count(ceph_pool_metadata{%(matchers)s})' % $.matchers(),
+ true,
+ 'table',
+ 0,
+ 0,
+ 3,
+ 3
+ ),
+ PoolOverviewSingleStatPanel(
+ 'none',
+ 'Pools with Compression',
+ 'Count of the pools that have compression enabled',
+ 'current',
+ 'count(ceph_pool_metadata{%(matchers)s, compression_mode!="none"})' % $.matchers(),
+ null,
'',
- {
- refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
- )
- .addRequired(
- type='grafana', id='grafana', name='Grafana', version='5.3.2'
- )
- .addRequired(
- type='panel', id='graph', name='Graph', version='5.0.0'
- )
- .addRequired(
- type='panel', id='singlestat', name='Singlestat', version='5.0.0'
- )
- .addAnnotation(
- u.addAnnotationSchema(
- 1,
- '-- Grafana --',
- true,
- true,
- 'rgba(0, 211, 255, 1)',
- 'Annotations & Alerts',
- 'dashboard'
- )
- )
- .addTemplate(
- g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
- )
- .addTemplate(
- u.addClusterTemplate()
- )
- .addTemplate(
- u.addJobTemplate()
+ 3,
+ 0,
+ 3,
+ 3
+ ),
+ PoolOverviewSingleStatPanel(
+ 'bytes',
+ 'Total Raw Capacity',
+ 'Total raw capacity available to the cluster',
+ 'current',
+ 'sum(ceph_osd_stat_bytes{%(matchers)s})' % $.matchers(),
+ null,
+ '',
+ 6,
+ 0,
+ 3,
+ 3
+ ),
+ PoolOverviewSingleStatPanel(
+ 'bytes',
+ 'Raw Capacity Consumed',
+ 'Total raw capacity consumed by user data and associated overheads (metadata + redundancy)',
+ 'current',
+ 'sum(ceph_pool_bytes_used{%(matchers)s})' % $.matchers(),
+ true,
+ '',
+ 9,
+ 0,
+ 3,
+ 3
+ ),
+ PoolOverviewSingleStatPanel(
+ 'bytes',
+ 'Logical Stored',
+ 'Total client data stored in the cluster',
+ 'current',
+ 'sum(ceph_pool_stored{%(matchers)s})' % $.matchers(),
+ true,
+ '',
+ 12,
+ 0,
+ 3,
+ 3
+ ),
+ PoolOverviewSingleStatPanel(
+ 'bytes',
+ 'Compression Savings',
+ 'Compression savings, calculated as the capacity of data eligible for compression minus the capacity used to store that data after compression',
+ 'current',
+ |||
+ sum(
+ ceph_pool_compress_under_bytes{%(matchers)s} -
+ ceph_pool_compress_bytes_used{%(matchers)s}
+ )
+ ||| % $.matchers(),
+ null,
+ '',
+ 15,
+ 0,
+ 3,
+ 3
+ ),
+ PoolOverviewSingleStatPanel(
+ 'percent',
+ 'Compression Eligibility',
+ 'Indicates how much of the data in compression-enabled pools is eligible for compression, averaged across all pools holding compressed data',
+ 'current',
+ |||
+ (
+ sum(ceph_pool_compress_under_bytes{%(matchers)s} > 0) /
+ sum(ceph_pool_stored_raw{%(matchers)s} and ceph_pool_compress_under_bytes{%(matchers)s} > 0)
+ ) * 100
+ ||| % $.matchers(),
+ null,
+ 'table',
+ 18,
+ 0,
+ 3,
+ 3
+ ),
+ PoolOverviewSingleStatPanel(
+ 'none',
+ 'Compression Factor',
+ 'The average ratio of data eligible for compression to the capacity actually used to store it. It does not account for data that was ineligible for compression (too small, or compression yield too low)',
+ 'current',
+ |||
+ sum(ceph_pool_compress_under_bytes{%(matchers)s} > 0) /
+ sum(ceph_pool_compress_bytes_used{%(matchers)s} > 0)
+ ||| % $.matchers(),
+ null,
+ '',
+ 21,
+ 0,
+ 3,
+ 3
+ ),
+ $.addTableSchema(
+ '$datasource',
+ '',
+ { col: 5, desc: true },
+ [
+ PoolOverviewStyle('', 'Time', 'hidden', 'short', null, [], []),
+ PoolOverviewStyle('', 'instance', 'hidden', 'short', null, [], []),
+ PoolOverviewStyle('', 'job', 'hidden', 'short', null, [], []),
+ PoolOverviewStyle('Pool Name', 'name', 'string', 'short', null, [], []),
+ PoolOverviewStyle('Pool ID', 'pool_id', 'hidden', 'none', null, [], []),
+ PoolOverviewStyle('Compression Factor', 'Value #A', 'number', 'none', null, [], []),
+ PoolOverviewStyle('% Used', 'Value #D', 'number', 'percentunit', 'value', ['70', '85'], []),
+ PoolOverviewStyle('Usable Free', 'Value #B', 'number', 'bytes', null, [], []),
+ PoolOverviewStyle('Compression Eligibility', 'Value #C', 'number', 'percent', null, [], []),
+ PoolOverviewStyle('Compression Savings', 'Value #E', 'number', 'bytes', null, [], []),
+ PoolOverviewStyle('Growth (5d)', 'Value #F', 'number', 'bytes', 'value', ['0', '0'], []),
+ PoolOverviewStyle('IOPS', 'Value #G', 'number', 'none', null, [], []),
+ PoolOverviewStyle('Bandwidth', 'Value #H', 'number', 'Bps', null, [], []),
+ PoolOverviewStyle('', '__name__', 'hidden', 'short', null, [], []),
+ PoolOverviewStyle('', 'type', 'hidden', 'short', null, [], []),
+ PoolOverviewStyle('', 'compression_mode', 'hidden', 'short', null, [], []),
+ PoolOverviewStyle('Type', 'description', 'string', 'short', null, [], []),
+ PoolOverviewStyle('Stored', 'Value #J', 'number', 'bytes', null, [], []),
+ PoolOverviewStyle('', 'Value #I', 'hidden', 'short', null, [], []),
+ PoolOverviewStyle('Compression', 'Value #K', 'string', 'short', null, [], [{ text: 'ON', value: '1' }]),
+ ],
+ 'Pool Overview',
+ 'table'
)
- .addTemplate(
- u.addTemplateSchema('pool_name',
- '$datasource',
- 'label_values(ceph_pool_metadata{%(matchers)s}, name)' % u.matchers(),
- 1,
- false,
- 1,
- 'Pool Name',
- '')
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ (
+ ceph_pool_compress_under_bytes{%(matchers)s} /
+ ceph_pool_compress_bytes_used{%(matchers)s} > 0
+ ) and on(pool_id) (
+ (
+ (ceph_pool_compress_under_bytes{%(matchers)s} > 0) /
+ ceph_pool_stored_raw{%(matchers)s}
+ ) * 100 > 0.5
+ )
+ ||| % $.matchers(),
+ 'A',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ |||
+ ceph_pool_max_avail{%(matchers)s} *
+ on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s}
+ ||| % $.matchers(),
+ 'B',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ |||
+ (
+ (ceph_pool_compress_under_bytes{%(matchers)s} > 0) /
+ ceph_pool_stored_raw{%(matchers)s}
+ ) * 100
+ ||| % $.matchers(),
+ 'C',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ |||
+ ceph_pool_percent_used{%(matchers)s} *
+ on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s}
+ ||| % $.matchers(),
+ 'D',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ |||
+ ceph_pool_compress_under_bytes{%(matchers)s} -
+ ceph_pool_compress_bytes_used{%(matchers)s} > 0
+ ||| % $.matchers(),
+ 'E',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ 'delta(ceph_pool_stored{%(matchers)s}[5d])' % $.matchers(), 'F', 'table', 1, true
+ ),
+ $.addTargetSchema(
+ |||
+ rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])
+ + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])
+ ||| % $.matchers(),
+ 'G',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ |||
+ rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval])
+ ||| % $.matchers(),
+ 'H',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ 'ceph_pool_metadata{%(matchers)s}' % $.matchers(), 'I', 'table', 1, true
+ ),
+ $.addTargetSchema(
+ 'ceph_pool_stored{%(matchers)s} * on(pool_id) group_left ceph_pool_metadata{%(matchers)s}' % $.matchers(),
+ 'J',
+ 'table',
+ 1,
+ true
+ ),
+ $.addTargetSchema(
+ 'ceph_pool_metadata{%(matchers)s, compression_mode!="none"}' % $.matchers(), 'K', 'table', 1, true
+ ),
+ $.addTargetSchema('', 'L', '', '', null),
+ ]
+ ) + { gridPos: { x: 0, y: 3, w: 24, h: 6 } },
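+ // Top $topk pools by combined read+write IOPS; a second target breaks out writes separately.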
+ PoolOverviewGraphPanel(
+ 'Top $topk Client IOPS by Pool',
+ 'This chart shows the sum of read and write IOPS from all clients by pool',
+ 'short',
+ 'IOPS',
+ |||
+ topk($topk,
+ round(
+ (
+ rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])
+ ), 1
+ ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s})
+ ||| % $.matchers(),
+ '{{name}} ',
+ 0,
+ 9,
+ 12,
+ 8
)
- .addPanels([
- PoolDetailSingleStatPanel(
- 'percentunit',
- 'Capacity used',
- '',
- 'current',
- true,
- 1,
- true,
- true,
- '.7,.8',
- |||
- (ceph_pool_stored{%(matchers)s} / (ceph_pool_stored{%(matchers)s} + ceph_pool_max_avail{%(matchers)s})) *
- on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
- ||| % u.matchers(),
- 'time_series',
- 0,
- 0,
- 7,
- 7
- ),
- PoolDetailSingleStatPanel(
- 's',
- 'Time till full',
- 'Time till pool is full assuming the average fill rate of the last 4 hours',
- false,
- 100,
- false,
- false,
- '',
- 'current',
+ .addTarget(
+ $.addTargetSchema(
|||
- (ceph_pool_max_avail{%(matchers)s} / deriv(ceph_pool_stored{%(matchers)s}[6h])) *
- on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} > 0
- ||| % u.matchers(),
- 'time_series',
- 7,
- 0,
- 5,
- 7
- ),
- PoolDetailGraphPanel(
- {
- read_op_per_sec:
- '#3F6833',
- write_op_per_sec: '#E5AC0E',
- },
- '$pool_name Object Ingress/Egress',
- '',
- 'ops',
- 'Objects out(-) / in(+) ',
- |||
- deriv(ceph_pool_objects{%(matchers)s}[1m]) *
- on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
- ||| % u.matchers(),
- 'Objects per second',
- 12,
- 0,
- 12,
- 7
- ),
- PoolDetailGraphPanel(
- {
- read_op_per_sec: '#3F6833',
- write_op_per_sec: '#E5AC0E',
- },
- '$pool_name Client IOPS',
- '',
- 'iops',
- 'Read (-) / Write (+)',
- |||
- rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) *
- on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
- ||| % u.matchers(),
- 'reads',
- 0,
- 7,
- 12,
- 7
+ topk($topk,
+ rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) *
+ on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s}
+ )
+ ||| % $.matchers(),
+ '{{name}} - write'
)
- .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' })
- .addTarget(
- u.addTargetSchema(
- |||
- rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) *
- on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
- ||| % u.matchers(),
- 'writes'
+ ),
+ PoolOverviewGraphPanel(
+ 'Top $topk Client Bandwidth by Pool',
+ 'The chart shows the sum of read and write bytes from all clients, by pool',
+ 'Bps',
+ 'Throughput',
+ |||
+ topk($topk,
+ (
+ rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval])
+ ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s}
)
- ),
- PoolDetailGraphPanel(
- {
- read_op_per_sec: '#3F6833',
- write_op_per_sec: '#E5AC0E',
- },
- '$pool_name Client Throughput',
- '',
- 'Bps',
- 'Read (-) / Write (+)',
+ ||| % $.matchers(),
+ '{{name}}',
+ 12,
+ 9,
+ 12,
+ 8
+ ),
+ PoolOverviewGraphPanel(
+ 'Pool Capacity Usage (RAW)',
+ 'Historical view of capacity usage, to help identify growth and trends in pool consumption',
+ 'bytes',
+ 'Capacity Used',
+ 'ceph_pool_bytes_used{%(matchers)s} * on(pool_id) group_right ceph_pool_metadata{%(matchers)s}' % $.matchers(),
+ '{{name}}',
+ 0,
+ 17,
+ 24,
+ 7
+ ),
+ ]),
+ 'pool-detail.json':
+ local PoolDetailSingleStatPanel(format,
+ title,
+ description,
+ valueName,
+ colorValue,
+ gaugeMaxValue,
+ gaugeShow,
+ sparkLineShow,
+ thresholds,
+ expr,
+ targetFormat,
+ x,
+ y,
+ w,
+ h) =
+ $.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
+ '$datasource',
+ format,
+ title,
+ description,
+ valueName,
+ colorValue,
+ gaugeMaxValue,
+ gaugeShow,
+ sparkLineShow,
+ thresholds)
+ .addTarget($.addTargetSchema(expr, '', targetFormat)) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+ local PoolDetailGraphPanel(alias,
+ title,
+ description,
+ formatY1,
+ labelY1,
+ expr,
+ legendFormat,
+ x,
+ y,
+ w,
+ h) =
+ $.graphPanelSchema(alias,
+ title,
+ description,
+ 'null as zero',
+ false,
+ formatY1,
+ 'short',
+ labelY1,
+ null,
+ null,
+ 1,
+ '$datasource')
+ .addTargets(
+ [$.addTargetSchema(expr, legendFormat)]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
+
+ $.dashboardSchema(
+ 'Ceph Pool Details',
+ '',
+ '-xyV8KCiz',
+ 'now-1h',
+ '15s',
+ 22,
+ $._config.dashboardTags,
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='singlestat', name='Singlestat', version='5.0.0'
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('pool_name',
+ '$datasource',
+ 'label_values(ceph_pool_metadata{%(matchers)s}, name)' % $.matchers(),
+ 1,
+ false,
+ 1,
+ 'Pool Name',
+ '')
+ )
+ .addPanels([
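+ // Capacity used gauge: stored / (stored + max_avail) per pool, with thresholds at 70% and 80%.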
+ PoolDetailSingleStatPanel(
+ 'percentunit',
+ 'Capacity used',
+ '',
+ 'current',
+ true,
+ 1,
+ true,
+ true,
+ '.7,.8',
+ |||
+ (ceph_pool_stored{%(matchers)s} / (ceph_pool_stored{%(matchers)s} + ceph_pool_max_avail{%(matchers)s})) *
+ on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'time_series',
+ 0,
+ 0,
+ 7,
+ 7
+ ),
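+ // Projected time to full, based on the 6-hour derivative of ceph_pool_stored.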
+ PoolDetailSingleStatPanel(
+ 's',
+ 'Time till full',
+ 'Time until the pool is full, assuming the average fill rate of the last 6 hours',
+ false,
+ 100,
+ false,
+ false,
+ '',
+ 'current',
+ |||
+ (ceph_pool_max_avail{%(matchers)s} / deriv(ceph_pool_stored{%(matchers)s}[6h])) *
+ on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} > 0
+ ||| % $.matchers(),
+ 'time_series',
+ 7,
+ 0,
+ 5,
+ 7
+ ),
+ PoolDetailGraphPanel(
+ {
+ read_op_per_sec:
+ '#3F6833',
+ write_op_per_sec: '#E5AC0E',
+ },
+ '$pool_name Object Ingress/Egress',
+ '',
+ 'ops',
+ 'Objects out(-) / in(+) ',
+ |||
+ deriv(ceph_pool_objects{%(matchers)s}[1m]) *
+ on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'Objects per second',
+ 12,
+ 0,
+ 12,
+ 7
+ ),
+ PoolDetailGraphPanel(
+ {
+ read_op_per_sec: '#3F6833',
+ write_op_per_sec: '#E5AC0E',
+ },
+ '$pool_name Client IOPS',
+ '',
+ 'iops',
+ 'Read (-) / Write (+)',
+ |||
+ rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) *
+ on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'reads',
+ 0,
+ 7,
+ 12,
+ 7
+ )
+ .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' })
+ .addTarget(
+ $.addTargetSchema(
|||
- rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) *
on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
- ||| % u.matchers(),
- 'reads',
- 12,
- 7,
- 12,
- 7
+ ||| % $.matchers(),
+ 'writes'
)
- .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' })
- .addTarget(
- u.addTargetSchema(
- |||
- rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) +
- on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
- ||| % u.matchers(),
- 'writes'
- )
- ),
- PoolDetailGraphPanel(
- {
- read_op_per_sec: '#3F6833',
- write_op_per_sec: '#E5AC0E',
- },
- '$pool_name Objects',
- '',
- 'short',
- 'Objects',
+ ),
+ PoolDetailGraphPanel(
+ {
+ read_op_per_sec: '#3F6833',
+ write_op_per_sec: '#E5AC0E',
+ },
+ '$pool_name Client Throughput',
+ '',
+ 'Bps',
+ 'Read (-) / Write (+)',
+ |||
+ rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) *
+ on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'reads',
+ 12,
+ 7,
+ 12,
+ 7
+ )
+ .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' })
+ .addTarget(
+ $.addTargetSchema(
|||
- ceph_pool_objects{%(matchers)s} *
+ rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) *
on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
- ||| % u.matchers(),
- 'Number of Objects',
- 0,
- 14,
- 12,
- 7
- ),
- ]),
- },
+ ||| % $.matchers(),
+ 'writes'
+ )
+ ),
+ PoolDetailGraphPanel(
+ {
+ read_op_per_sec: '#3F6833',
+ write_op_per_sec: '#E5AC0E',
+ },
+ '$pool_name Objects',
+ '',
+ 'short',
+ 'Objects',
+ |||
+ ceph_pool_objects{%(matchers)s} *
+ on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"}
+ ||| % $.matchers(),
+ 'Number of Objects',
+ 0,
+ 14,
+ 12,
+ 7
+ ),
+ ]),
}
local g = import 'grafonnet/grafana.libsonnet';
-local u = import 'utils.libsonnet';
-local c = (import '../mixin.libsonnet')._config;
-{
- grafanaDashboards+:: {
- 'rbd-details.json':
- local RbdDetailsPanel(title, formatY1, expr1, expr2, x, y, w, h) =
- u.graphPanelSchema({},
- title,
- '',
- 'null as zero',
- false,
- formatY1,
- formatY1,
- null,
- null,
- 0,
- 1,
- '$datasource')
- .addTargets(
- [
- u.addTargetSchema(expr1,
- '{{pool}} Write'),
- u.addTargetSchema(expr2, '{{pool}} Read'),
- ]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
+(import 'utils.libsonnet') {
+ 'rbd-details.json':
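+ // Helper that builds a write/read graph panel for the selected pool and image.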
+ local RbdDetailsPanel(title, formatY1, expr1, expr2, x, y, w, h) =
+ $.graphPanelSchema({},
+ title,
+ '',
+ 'null as zero',
+ false,
+ formatY1,
+ formatY1,
+ null,
+ null,
+ 0,
+ 1,
+ '$datasource')
+ .addTargets(
+ [
+ $.addTargetSchema(expr1,
+ '{{pool}} Write'),
+ $.addTargetSchema(expr2, '{{pool}} Read'),
+ ]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
- u.dashboardSchema(
- 'RBD Details',
- 'Detailed Performance of RBD Images (IOPS/Throughput/Latency)',
- 'YhCYGcuZz',
- 'now-1h',
- false,
- 16,
- c.dashboardTags,
- '',
- {
- refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
- )
- .addAnnotation(
- u.addAnnotationSchema(
- 1,
- '-- Grafana --',
- true,
- true,
- 'rgba(0, 211, 255, 1)',
- 'Annotations & Alerts',
- 'dashboard'
- )
- )
- .addRequired(
- type='grafana', id='grafana', name='Grafana', version='5.3.3'
- )
- .addRequired(
- type='panel', id='graph', name='Graph', version='5.0.0'
- )
- .addTemplate(
- g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
- )
- .addTemplate(
- u.addClusterTemplate()
- )
- .addTemplate(
- u.addJobTemplate()
- )
- .addTemplate(
- u.addTemplateSchema('pool',
- '$datasource',
- 'label_values(pool)',
- 1,
- false,
- 0,
- '',
- '')
- )
- .addTemplate(
- u.addTemplateSchema('image',
- '$datasource',
- 'label_values(image)',
- 1,
- false,
- 0,
- '',
- '')
+ $.dashboardSchema(
+ 'RBD Details',
+ 'Detailed Performance of RBD Images (IOPS/Throughput/Latency)',
+ 'YhCYGcuZz',
+ 'now-1h',
+ false,
+ 16,
+ $._config.dashboardTags,
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
)
- .addPanels([
- RbdDetailsPanel(
- 'IOPS',
- 'iops',
- 'rate(ceph_rbd_write_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers()
- ,
- 'rate(ceph_rbd_read_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers(),
- 0,
- 0,
- 8,
- 9
- ),
- RbdDetailsPanel(
- 'Throughput',
- 'Bps',
- 'rate(ceph_rbd_write_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers(),
- 'rate(ceph_rbd_read_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers(),
- 8,
- 0,
- 8,
- 9
- ),
- RbdDetailsPanel(
- 'Average Latency',
- 'ns',
- |||
- rate(ceph_rbd_write_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) /
- rate(ceph_rbd_write_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])
- ||| % u.matchers(),
- |||
- rate(ceph_rbd_read_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) /
- rate(ceph_rbd_read_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])
- ||| % u.matchers(),
- 16,
- 0,
- 8,
- 9
- ),
- ]),
- 'rbd-overview.json':
- local RgwOverviewStyle(alias, pattern, type, unit) =
- u.addStyle(alias,
- null,
- ['rgba(245, 54, 54, 0.9)', 'rgba(237, 129, 40, 0.89)', 'rgba(50, 172, 45, 0.97)'],
- 'YYYY-MM-DD HH:mm:ss',
- 2,
- 1,
- pattern,
- [],
- type,
- unit,
- []);
- local RbdOverviewPanel(title,
- formatY1,
- expr1,
- expr2,
- legendFormat1,
- legendFormat2,
- x,
- y,
- w,
- h) =
- u.graphPanelSchema({},
- title,
- '',
- 'null',
- false,
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.3.3'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('pool',
+ '$datasource',
+ 'label_values(pool)',
+ 1,
+ false,
+ 0,
+ '',
+ '')
+ )
+ .addTemplate(
+ $.addTemplateSchema('image',
+ '$datasource',
+ 'label_values(image)',
+ 1,
+ false,
+ 0,
+ '',
+ '')
+ )
+ .addPanels([
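+ // IOPS, throughput and average latency panels for the selected $pool/$image.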
+ RbdDetailsPanel(
+ 'IOPS',
+ 'iops',
+ 'rate(ceph_rbd_write_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers()
+ ,
+ 'rate(ceph_rbd_read_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers(),
+ 0,
+ 0,
+ 8,
+ 9
+ ),
+ RbdDetailsPanel(
+ 'Throughput',
+ 'Bps',
+ 'rate(ceph_rbd_write_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers(),
+ 'rate(ceph_rbd_read_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % $.matchers(),
+ 8,
+ 0,
+ 8,
+ 9
+ ),
+ RbdDetailsPanel(
+ 'Average Latency',
+ 'ns',
+ |||
+ rate(ceph_rbd_write_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) /
+ rate(ceph_rbd_write_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])
+ ||| % $.matchers(),
+ |||
+ rate(ceph_rbd_read_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) /
+ rate(ceph_rbd_read_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])
+ ||| % $.matchers(),
+ 16,
+ 0,
+ 8,
+ 9
+ ),
+ ]),
+ 'rbd-overview.json':
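+ // RgwOverviewStyle defines table column styles; RbdOverviewPanel builds a two-series (write/read) graph panel.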
+ local RgwOverviewStyle(alias, pattern, type, unit) =
+ $.addStyle(alias,
+ null,
+ ['rgba(245, 54, 54, 0.9)', 'rgba(237, 129, 40, 0.89)', 'rgba(50, 172, 45, 0.97)'],
+ 'YYYY-MM-DD HH:mm:ss',
+ 2,
+ 1,
+ pattern,
+ [],
+ type,
+ unit,
+ []);
+ local RbdOverviewPanel(title,
formatY1,
- 'short',
- null,
- null,
- 0,
- 1,
- '$datasource')
- .addTargets(
- [
- u.addTargetSchema(expr1,
- legendFormat1),
- u.addTargetSchema(expr2,
- legendFormat2),
- ]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
+ expr1,
+ expr2,
+ legendFormat1,
+ legendFormat2,
+ x,
+ y,
+ w,
+ h) =
+ $.graphPanelSchema({},
+ title,
+ '',
+ 'null',
+ false,
+ formatY1,
+ 'short',
+ null,
+ null,
+ 0,
+ 1,
+ '$datasource')
+ .addTargets(
+ [
+ $.addTargetSchema(expr1,
+ legendFormat1),
+ $.addTargetSchema(expr2,
+ legendFormat2),
+ ]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
- u.dashboardSchema(
- 'RBD Overview',
- '',
- '41FrpeUiz',
- 'now-1h',
- '30s',
+ $.dashboardSchema(
+ 'RBD Overview',
+ '',
+ '41FrpeUiz',
+ 'now-1h',
+ '30s',
+ 16,
+ $._config.dashboardTags + ['overview'],
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.4.2'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addRequired(
+ type='datasource', id='prometheus', name='Prometheus', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='table', name='Table', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addPanels([
+ RbdOverviewPanel(
+ 'IOPS',
+ 'short',
+ 'round(sum(rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval])))' % $.matchers(),
+ 'round(sum(rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval])))' % $.matchers(),
+ 'Writes',
+ 'Reads',
+ 0,
+ 0,
+ 8,
+ 7
+ ),
+ RbdOverviewPanel(
+ 'Throughput',
+ 'Bps',
+ 'round(sum(rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval])))' % $.matchers(),
+ 'round(sum(rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval])))' % $.matchers(),
+ 'Write',
+ 'Read',
+ 8,
+ 0,
+ 8,
+ 7
+ ),
+ RbdOverviewPanel(
+ 'Average Latency',
+ 'ns',
+ |||
+ round(
+ sum(rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval])) /
+ sum(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval]))
+ )
+ ||| % $.matchers(),
+ |||
+ round(
+ sum(rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval])) /
+ sum(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval]))
+ )
+ ||| % $.matchers(),
+ 'Write',
+ 'Read',
16,
- c.dashboardTags + ['overview'],
+ 0,
+ 8,
+ 7
+ ),
+ $.addTableSchema(
+ '$datasource',
'',
- {
- refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
- )
- .addAnnotation(
- u.addAnnotationSchema(
- 1,
- '-- Grafana --',
- true,
- true,
- 'rgba(0, 211, 255, 1)',
- 'Annotations & Alerts',
- 'dashboard'
- )
- )
- .addRequired(
- type='grafana', id='grafana', name='Grafana', version='5.4.2'
- )
- .addRequired(
- type='panel', id='graph', name='Graph', version='5.0.0'
- )
- .addRequired(
- type='datasource', id='prometheus', name='Prometheus', version='5.0.0'
+ { col: 3, desc: true },
+ [
+ RgwOverviewStyle('Pool', 'pool', 'string', 'short'),
+ RgwOverviewStyle('Image', 'image', 'string', 'short'),
+ RgwOverviewStyle('IOPS', 'Value', 'number', 'iops'),
+ RgwOverviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Highest IOPS',
+ 'table'
)
- .addRequired(
- type='panel', id='table', name='Table', version='5.0.0'
- )
- .addTemplate(
- g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
- )
- .addTemplate(
- u.addClusterTemplate()
- )
- .addTemplate(
- u.addJobTemplate()
- )
- .addPanels([
- RbdOverviewPanel(
- 'IOPS',
- 'short',
- 'round(sum(rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval])))' % u.matchers(),
- 'round(sum(rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval])))' % u.matchers(),
- 'Writes',
- 'Reads',
- 0,
- 0,
- 8,
- 7
- ),
- RbdOverviewPanel(
- 'Throughput',
- 'Bps',
- 'round(sum(rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval])))' % u.matchers(),
- 'round(sum(rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval])))' % u.matchers(),
- 'Write',
- 'Read',
- 8,
- 0,
- 8,
- 7
- ),
- RbdOverviewPanel(
- 'Average Latency',
- 'ns',
+ .addTarget(
+ $.addTargetSchema(
|||
- round(
- sum(rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval])) /
- sum(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval]))
- )
- ||| % u.matchers(),
- |||
- round(
- sum(rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval])) /
- sum(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval]))
+ topk(10,
+ (
+ sort((
+ rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval]) +
+ on (image, pool, namespace) rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval])
+ ))
+ )
)
- ||| % u.matchers(),
- 'Write',
- 'Read',
- 16,
- 0,
- 8,
- 7
- ),
- u.addTableSchema(
- '$datasource',
+ ||| % $.matchers(),
'',
- { col: 3, desc: true },
- [
- RgwOverviewStyle('Pool', 'pool', 'string', 'short'),
- RgwOverviewStyle('Image', 'image', 'string', 'short'),
- RgwOverviewStyle('IOPS', 'Value', 'number', 'iops'),
- RgwOverviewStyle('', '/.*/', 'hidden', 'short'),
- ],
- 'Highest IOPS',
- 'table'
+ 'table',
+ 1,
+ true
)
- .addTarget(
- u.addTargetSchema(
- |||
- topk(10,
- (
- sort((
- rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval]) +
- on (image, pool, namespace) rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval])
- ))
- )
+ ) + { gridPos: { x: 0, y: 7, w: 8, h: 7 } },
+ $.addTableSchema(
+ '$datasource',
+ '',
+ { col: 3, desc: true },
+ [
+ RgwOverviewStyle('Pool', 'pool', 'string', 'short'),
+ RgwOverviewStyle('Image', 'image', 'string', 'short'),
+ RgwOverviewStyle('Throughput', 'Value', 'number', 'Bps'),
+ RgwOverviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Highest Throughput',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ topk(10,
+ sort(
+ sum(
+ rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval])
+ ) by (pool, image, namespace)
)
- ||| % u.matchers(),
- '',
- 'table',
- 1,
- true
- )
- ) + { gridPos: { x: 0, y: 7, w: 8, h: 7 } },
- u.addTableSchema(
- '$datasource',
+ )
+ ||| % $.matchers(),
'',
- { col: 3, desc: true },
- [
- RgwOverviewStyle('Pool', 'pool', 'string', 'short'),
- RgwOverviewStyle('Image', 'image', 'string', 'short'),
- RgwOverviewStyle('Throughput', 'Value', 'number', 'Bps'),
- RgwOverviewStyle('', '/.*/', 'hidden', 'short'),
- ],
- 'Highest Throughput',
- 'table'
+ 'table',
+ 1,
+ true
)
- .addTarget(
- u.addTargetSchema(
- |||
- topk(10,
- sort(
- sum(
- rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval]) +
- rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval])
- ) by (pool, image, namespace)
- )
- )
- ||| % u.matchers(),
- '',
- 'table',
- 1,
- true
- )
- ) + { gridPos: { x: 8, y: 7, w: 8, h: 7 } },
- u.addTableSchema(
- '$datasource',
+ ) + { gridPos: { x: 8, y: 7, w: 8, h: 7 } },
+ $.addTableSchema(
+ '$datasource',
+ '',
+ { col: 3, desc: true },
+ [
+ RgwOverviewStyle('Pool', 'pool', 'string', 'short'),
+ RgwOverviewStyle('Image', 'image', 'string', 'short'),
+ RgwOverviewStyle('Latency', 'Value', 'number', 'ns'),
+ RgwOverviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Highest Latency',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ topk(10,
+ sum(
+ rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval]) /
+ clamp_min(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval]), 1) +
+ rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval]) /
+ clamp_min(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval]), 1)
+ ) by (pool, image, namespace)
+ )
+ ||| % $.matchers(),
'',
- { col: 3, desc: true },
- [
- RgwOverviewStyle('Pool', 'pool', 'string', 'short'),
- RgwOverviewStyle('Image', 'image', 'string', 'short'),
- RgwOverviewStyle('Latency', 'Value', 'number', 'ns'),
- RgwOverviewStyle('', '/.*/', 'hidden', 'short'),
- ],
- 'Highest Latency',
- 'table'
+ 'table',
+ 1,
+ true
)
- .addTarget(
- u.addTargetSchema(
- |||
- topk(10,
- sum(
- rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval]) /
- clamp_min(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval]), 1) +
- rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval]) /
- clamp_min(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval]), 1)
- ) by (pool, image, namespace)
- )
- ||| % u.matchers(),
- '',
- 'table',
- 1,
- true
- )
- ) + { gridPos: { x: 16, y: 7, w: 8, h: 7 } },
- ]),
- },
+ ) + { gridPos: { x: 16, y: 7, w: 8, h: 7 } },
+ ]),
}
-local u = import 'utils.libsonnet';
-local c = (import '../mixin.libsonnet')._config;
-{
- grafanaDashboards+:: {
- 'radosgw-sync-overview.json':
- local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) =
- u.graphPanelSchema({},
- title,
- '',
- 'null as zero',
- true,
- formatY1,
- 'short',
- labelY1,
- null,
- 0,
- 1,
- '$datasource')
- .addTargets(
- [
- u.addTargetSchema(
- 'sum by (source_zone) (rate(%(rgwMetric)s{%(matchers)s}[$__rate_interval]))'
- % (u.matchers() + { rgwMetric: rgwMetric }),
- '{{source_zone}}'
- ),
- ]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
+(import 'utils.libsonnet') {
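+ // Extends the utils.libsonnet object, so helpers are reached as $.<helper> and configuration as $._config.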
+ 'radosgw-sync-overview.json':
+ local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) =
+ $.graphPanelSchema({},
+ title,
+ '',
+ 'null as zero',
+ true,
+ formatY1,
+ 'short',
+ labelY1,
+ null,
+ 0,
+ 1,
+ '$datasource')
+ .addTargets(
+ [
+ $.addTargetSchema(
+ 'sum by (source_zone) (rate(%(rgwMetric)s{%(matchers)s}[$__rate_interval]))'
+ % ($.matchers() + { rgwMetric: rgwMetric }),
+ '{{source_zone}}'
+ ),
+ ]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
- u.dashboardSchema(
- 'RGW Sync Overview',
- '',
- 'rgw-sync-overview',
- 'now-1h',
- '15s',
- 16,
- c.dashboardTags + ['overview'],
- '',
- {
- refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
- )
- .addAnnotation(
- u.addAnnotationSchema(
- 1,
- '-- Grafana --',
- true,
- true,
- 'rgba(0, 211, 255, 1)',
- 'Annotations & Alerts',
- 'dashboard'
- )
- )
- .addRequired(
- type='grafana', id='grafana', name='Grafana', version='5.0.0'
- )
- .addRequired(
- type='panel', id='graph', name='Graph', version='5.0.0'
+ $.dashboardSchema(
+ 'RGW Sync Overview',
+ '',
+ 'rgw-sync-overview',
+ 'now-1h',
+ '15s',
+ 16,
+ $._config.dashboardTags + ['overview'],
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
)
- .addTemplate(
- g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
- )
- .addTemplate(
- u.addClusterTemplate()
- )
- .addTemplate(
- u.addJobTemplate()
- )
- .addTemplate(
- u.addTemplateSchema(
- 'rgw_servers',
- '$datasource',
- 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(),
- 1,
- true,
- 1,
- '',
- 'RGW Server'
- )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema(
+ 'rgw_servers',
+ '$datasource',
+ 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ '',
+ 'RGW Server'
)
- .addPanels([
- RgwSyncOverviewPanel(
- 'Replication (throughput) from Source Zone',
- 'Bps',
- null,
- 'ceph_data_sync_from_zone_fetch_bytes_sum',
- 0,
- 0,
- 8,
- 7
- ),
- RgwSyncOverviewPanel(
- 'Replication (objects) from Source Zone',
- 'short',
- 'Objects/s',
- 'ceph_data_sync_from_zone_fetch_bytes_count',
- 8,
- 0,
- 8,
- 7
- ),
- RgwSyncOverviewPanel(
- 'Polling Request Latency from Source Zone',
- 'ms',
- null,
- 'ceph_data_sync_from_zone_poll_latency_sum',
- 16,
- 0,
- 8,
- 7
- ),
- RgwSyncOverviewPanel(
- 'Unsuccessful Object Replications from Source Zone',
- 'short',
- 'Count/s',
- 'ceph_data_sync_from_zone_fetch_errors',
- 0,
- 7,
- 8,
- 7
- ),
- ]),
- 'radosgw-overview.json':
- local RgwOverviewPanel(
+ )
+ .addPanels([
+ RgwSyncOverviewPanel(
+ 'Replication (throughput) from Source Zone',
+ 'Bps',
+ null,
+ 'ceph_data_sync_from_zone_fetch_bytes_sum',
+ 0,
+ 0,
+ 8,
+ 7
+ ),
+ RgwSyncOverviewPanel(
+ 'Replication (objects) from Source Zone',
+ 'short',
+ 'Objects/s',
+ 'ceph_data_sync_from_zone_fetch_bytes_count',
+ 8,
+ 0,
+ 8,
+ 7
+ ),
+ RgwSyncOverviewPanel(
+ 'Polling Request Latency from Source Zone',
+ 'ms',
+ null,
+ 'ceph_data_sync_from_zone_poll_latency_sum',
+ 16,
+ 0,
+ 8,
+ 7
+ ),
+ RgwSyncOverviewPanel(
+ 'Unsuccessful Object Replications from Source Zone',
+ 'short',
+ 'Count/s',
+ 'ceph_data_sync_from_zone_fetch_errors',
+ 0,
+ 7,
+ 8,
+ 7
+ ),
+ ]),
+ 'radosgw-overview.json':
+ local RgwOverviewPanel(
+ title,
+ description,
+ formatY1,
+ formatY2,
+ expr1,
+ legendFormat1,
+ x,
+ y,
+ w,
+ h,
+ datasource='$datasource',
+ legend_alignAsTable=false,
+ legend_avg=false,
+ legend_min=false,
+ legend_max=false,
+ legend_current=false,
+ legend_values=false
+ ) =
+ $.graphPanelSchema(
+ {},
title,
description,
+ 'null',
+ false,
formatY1,
formatY2,
- expr1,
- legendFormat1,
- x,
- y,
- w,
- h,
- datasource='$datasource',
- legend_alignAsTable=false,
- legend_avg=false,
- legend_min=false,
- legend_max=false,
- legend_current=false,
- legend_values=false
- ) =
- u.graphPanelSchema(
- {},
- title,
- description,
- 'null',
- false,
- formatY1,
- formatY2,
- null,
- null,
- 0,
- 1,
- datasource,
- legend_alignAsTable,
- legend_avg,
- legend_min,
- legend_max,
- legend_current,
- legend_values
- )
- .addTargets(
- [u.addTargetSchema(expr1, legendFormat1)]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
+ null,
+ null,
+ 0,
+ 1,
+ datasource,
+ legend_alignAsTable,
+ legend_avg,
+ legend_min,
+ legend_max,
+ legend_current,
+ legend_values
+ )
+ .addTargets(
+ [$.addTargetSchema(expr1, legendFormat1)]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
- u.dashboardSchema(
- 'RGW Overview',
- '',
- 'WAkugZpiz',
- 'now-1h',
- '15s',
- 16,
- c.dashboardTags + ['overview'],
+ $.dashboardSchema(
+ 'RGW Overview',
+ '',
+ 'WAkugZpiz',
+ 'now-1h',
+ '15s',
+ 16,
+ $._config.dashboardTags + ['overview'],
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.0.0'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource',
+ 'prometheus',
+ 'default',
+ label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema(
+ 'rgw_servers',
+ '$datasource',
+ 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
'',
- {
- refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
+ 'RGW Server'
)
- .addAnnotation(
- u.addAnnotationSchema(
- 1,
- '-- Grafana --',
- true,
- true,
- 'rgba(0, 211, 255, 1)',
- 'Annotations & Alerts',
- 'dashboard'
- )
+ )
+ .addTemplate(
+ $.addTemplateSchema(
+ 'code',
+ '$datasource',
+ 'label_values(haproxy_server_http_responses_total{job=~"$job_haproxy", instance=~"$ingress_service"}, code)',
+ 1,
+ true,
+ 1,
+ 'HTTP Code',
+ ''
)
- .addRequired(
- type='grafana', id='grafana', name='Grafana', version='5.0.0'
+ )
+ .addTemplate(
+ $.addTemplateSchema(
+ 'job_haproxy',
+ '$datasource',
+ 'label_values(haproxy_server_status, job)',
+ 1,
+ true,
+ 1,
+ 'job haproxy',
+ '(.*)',
+ multi=true,
+ allValues='.+',
+ ),
+ )
+ .addTemplate(
+ $.addTemplateSchema(
+ 'ingress_service',
+ '$datasource',
+ 'label_values(haproxy_server_status{job=~"$job_haproxy"}, instance)',
+ 1,
+ true,
+ 1,
+ 'Ingress Service',
+ ''
)
- .addRequired(
- type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addPanels([
+ $.addRowSchema(false,
+ true,
+ 'RGW Overview - All Gateways') +
+ {
+ gridPos: { x: 0, y: 0, w: 24, h: 1 },
+ },
+ RgwOverviewPanel(
+ 'Average GET/PUT Latencies',
+ '',
+ 's',
+ 'short',
+ |||
+ rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}
+ ||| % $.matchers(),
+ 'GET AVG',
+ 0,
+ 1,
+ 8,
+ 7
+ ).addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}
+ ||| % $.matchers(),
+ 'PUT AVG'
+ ),
+ ]
+ ),
+ RgwOverviewPanel(
+ 'Total Requests/sec by RGW Instance',
+ '',
+ 'none',
+ 'short',
+ |||
+ sum by (rgw_host) (
+ label_replace(
+ rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
+ )
+ )
+ ||| % $.matchers(),
+ '{{rgw_host}}',
+ 8,
+ 1,
+ 7,
+ 7
+ ),
+ RgwOverviewPanel(
+ 'GET Latencies by RGW Instance',
+ 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts',
+ 's',
+ 'short',
+ |||
+ label_replace(
+ rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
+ )
+ ||| % $.matchers(),
+ '{{rgw_host}}',
+ 15,
+ 1,
+ 6,
+ 7
+ ),
+ RgwOverviewPanel(
+ 'Bandwidth Consumed by Type',
+ 'Total bytes transferred in/out of all radosgw instances within the cluster',
+ 'bytes',
+ 'short',
+ 'sum(rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]))' % $.matchers(),
+ 'GETs',
+ 0,
+ 8,
+ 8,
+ 6
+ ).addTargets(
+ [$.addTargetSchema('sum(rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]))' % $.matchers(),
+ 'PUTs')]
+ ),
+ RgwOverviewPanel(
+ 'Bandwidth by RGW Instance',
+ 'Total bytes transferred in/out through get/put operations, by radosgw instance',
+ 'bytes',
+ 'short',
+ |||
+ label_replace(sum by (instance_id) (
+ rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval])) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
+ )
+ ||| % $.matchers(),
+ '{{rgw_host}}',
+ 8,
+ 8,
+ 7,
+ 6
+ ),
+ RgwOverviewPanel(
+ 'PUT Latencies by RGW Instance',
+ 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts',
+ 's',
+ 'short',
+ |||
+ label_replace(
+ rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
+ "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
+ )
+ ||| % $.matchers(),
+ '{{rgw_host}}',
+ 15,
+ 8,
+ 6,
+ 6
+ ),
+ $.addRowSchema(
+ false, true, 'RGW Overview - HAProxy Metrics'
+ ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } },
+ RgwOverviewPanel(
+ 'Total responses by HTTP code',
+ '',
+ 'short',
+ 'short',
+ |||
+ sum(
+ rate(
+ haproxy_frontend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"frontend"}[$__rate_interval]
+ )
+ ) by (code)
+ |||,
+ 'Frontend {{ code }}',
+ 0,
+ 12,
+ 5,
+ 12,
+ '$datasource',
+ true,
+ true,
+ true,
+ true,
+ true,
+ true
)
- .addTemplate(
- g.template.datasource('datasource',
- 'prometheus',
- 'default',
- label='Data Source')
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"backend"}[$__rate_interval]
+ )
+ ) by (code)
+ |||, 'Backend {{ code }}'
+ ),
+ ]
)
- .addTemplate(
- u.addClusterTemplate()
+ .addSeriesOverride([
+ {
+ alias: '/.*Back.*/',
+ transform: 'negative-Y',
+ },
+ { alias: '/.*1.*/' },
+ { alias: '/.*2.*/' },
+ { alias: '/.*3.*/' },
+ { alias: '/.*4.*/' },
+ { alias: '/.*5.*/' },
+ { alias: '/.*other.*/' },
+ ]),
+ RgwOverviewPanel(
+ 'Total requests / responses',
+ '',
+ 'short',
+ 'short',
+ |||
+ sum(
+ rate(
+ haproxy_frontend_http_requests_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||,
+ 'Requests',
+ 5,
+ 12,
+ 5,
+ 12,
+ '$datasource',
+ true,
+ true,
+ true,
+ true,
+ true,
+ true
)
- .addTemplate(
- u.addJobTemplate()
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_response_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Response errors', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_frontend_request_errors_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Requests errors'
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_redispatch_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Backend redispatch', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_retry_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Backend retry', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_frontend_requests_denied_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Request denied', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ haproxy_backend_current_queue{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}
+ ) by (instance)
+ |||, 'Backend Queued', 'time_series', 2
+ ),
+ ]
)
- .addTemplate(
- u.addTemplateSchema(
- 'rgw_servers',
- '$datasource',
- 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(),
- 1,
- true,
- 1,
- '',
- 'RGW Server'
- )
+ .addSeriesOverride([
+ {
+ alias: '/.*Response.*/',
+ transform: 'negative-Y',
+ },
+ {
+ alias: '/.*Backend.*/',
+ transform: 'negative-Y',
+ },
+ ]),
+ RgwOverviewPanel(
+ 'Total number of connections',
+ '',
+ 'short',
+ 'short',
+ |||
+ sum(
+ rate(
+ haproxy_frontend_connections_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||,
+ 'Front',
+ 10,
+ 12,
+ 5,
+ 12,
+ '$datasource',
+ true,
+ true,
+ true,
+ true,
+ true,
+ true
)
- .addTemplate(
- u.addTemplateSchema(
- 'code',
- '$datasource',
- 'label_values(haproxy_server_http_responses_total{job=~"$job_haproxy", instance=~"$ingress_service"}, code)',
- 1,
- true,
- 1,
- 'HTTP Code',
- ''
- )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_connection_attempts_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Back'
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_connection_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ )
+ ) by (instance)
+ |||, 'Back errors'
+ ),
+ ]
)
- .addTemplate(
- u.addTemplateSchema(
- 'job_haproxy',
- '$datasource',
- 'label_values(haproxy_server_status, job)',
- 1,
- true,
- 1,
- 'job haproxy',
- '(.*)',
- multi=true,
- allValues='.+',
- ),
+ .addSeriesOverride([
+ {
+ alias: '/.*Back.*/',
+ transform: 'negative-Y',
+ },
+ ]),
+ RgwOverviewPanel(
+ 'Current total of incoming / outgoing bytes',
+ '',
+ 'short',
+ 'short',
+ |||
+ sum(
+ rate(
+ haproxy_frontend_bytes_in_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ ) * 8
+ ) by (instance)
+ |||,
+ 'IN Front',
+ 15,
+ 12,
+ 6,
+ 12,
+ '$datasource',
+ true,
+ true,
+ true,
+ true,
+ true,
+ true
)
- .addTemplate(
- u.addTemplateSchema(
- 'ingress_service',
- '$datasource',
- 'label_values(haproxy_server_status{job=~"$job_haproxy"}, instance)',
- 1,
- true,
- 1,
- 'Ingress Service',
- ''
- )
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_frontend_bytes_out_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ ) * 8
+ ) by (instance)
+ |||, 'OUT Front', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_bytes_in_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ ) * 8
+ ) by (instance)
+ |||, 'IN Back', 'time_series', 2
+ ),
+ $.addTargetSchema(
+ |||
+ sum(
+ rate(
+ haproxy_backend_bytes_out_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
+ ) * 8
+ ) by (instance)
+ |||, 'OUT Back', 'time_series', 2
+ ),
+ ]
)
- .addPanels([
- u.addRowSchema(false,
- true,
- 'RGW Overview - All Gateways') +
+ .addSeriesOverride([
{
- gridPos: { x: 0, y: 0, w: 24, h: 1 },
+ alias: '/.*OUT.*/',
+ transform: 'negative-Y',
},
- RgwOverviewPanel(
- 'Average GET/PUT Latencies',
- '',
- 's',
- 'short',
- |||
- rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
- rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}
- ||| % u.matchers(),
- 'GET AVG',
- 0,
- 1,
- 8,
- 7
- ).addTargets(
- [
- u.addTargetSchema(
- |||
- rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
- rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}
- ||| % u.matchers(),
- 'PUT AVG'
- ),
- ]
- ),
- RgwOverviewPanel(
- 'Total Requests/sec by RGW Instance',
- '',
- 'none',
- 'short',
- |||
- sum by (rgw_host) (
- label_replace(
- rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
- "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
- )
- )
- ||| % u.matchers(),
- '{{rgw_host}}',
- 8,
- 1,
- 7,
- 7
- ),
- RgwOverviewPanel(
- 'GET Latencies by RGW Instance',
- 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts',
- 's',
- 'short',
- |||
- label_replace(
- rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
- rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
- "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
- )
- ||| % u.matchers(),
- '{{rgw_host}}',
- 15,
- 1,
- 6,
- 7
- ),
- RgwOverviewPanel(
- 'Bandwidth Consumed by Type',
- 'Total bytes transferred in/out of all radosgw instances within the cluster',
- 'bytes',
- 'short',
- 'sum(rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]))' % u.matchers(),
- 'GETs',
- 0,
- 8,
- 8,
- 6
- ).addTargets(
- [u.addTargetSchema('sum(rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]))' % u.matchers(),
- 'PUTs')]
- ),
- RgwOverviewPanel(
- 'Bandwidth by RGW Instance',
- 'Total bytes transferred in/out through get/put operations, by radosgw instance',
- 'bytes',
- 'short',
- |||
- label_replace(sum by (instance_id) (
- rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) +
- rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval])) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
- "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
- )
- ||| % u.matchers(),
- '{{rgw_host}}',
- 8,
- 8,
- 7,
- 6
- ),
- RgwOverviewPanel(
- 'PUT Latencies by RGW Instance',
- 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts',
- 's',
- 'short',
- |||
- label_replace(
- rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
- rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
- "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
- )
- ||| % u.matchers(),
- '{{rgw_host}}',
- 15,
- 8,
- 6,
- 6
- ),
- u.addRowSchema(
- false, true, 'RGW Overview - HAProxy Metrics'
- ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } },
- RgwOverviewPanel(
- 'Total responses by HTTP code',
- '',
- 'short',
- 'short',
- |||
- sum(
- rate(
- haproxy_frontend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"frontend"}[$__rate_interval]
- )
- ) by (code)
- |||,
- 'Frontend {{ code }}',
- 0,
- 12,
- 5,
- 12,
- '$datasource',
- true,
- true,
- true,
- true,
- true,
- true
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
- sum(
- rate(
- haproxy_backend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"backend"}[$__rate_interval]
- )
- ) by (code)
- |||, 'Backend {{ code }}'
- ),
- ]
- )
- .addSeriesOverride([
- {
- alias: '/.*Back.*/',
- transform: 'negative-Y',
- },
- { alias: '/.*1.*/' },
- { alias: '/.*2.*/' },
- { alias: '/.*3.*/' },
- { alias: '/.*4.*/' },
- { alias: '/.*5.*/' },
- { alias: '/.*other.*/' },
- ]),
- RgwOverviewPanel(
- 'Total requests / responses',
- '',
- 'short',
- 'short',
- |||
- sum(
- rate(
- haproxy_frontend_http_requests_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- )
- ) by (instance)
- |||,
- 'Requests',
- 5,
- 12,
- 5,
- 12,
- '$datasource',
- true,
- true,
- true,
- true,
- true,
- true
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
- sum(
- rate(
- haproxy_backend_response_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- )
- ) by (instance)
- |||, 'Response errors', 'time_series', 2
- ),
- u.addTargetSchema(
- |||
- sum(
- rate(
- haproxy_frontend_request_errors_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- )
- ) by (instance)
- |||, 'Requests errors'
- ),
- u.addTargetSchema(
- |||
- sum(
- rate(
- haproxy_backend_redispatch_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- )
- ) by (instance)
- |||, 'Backend redispatch', 'time_series', 2
- ),
- u.addTargetSchema(
- |||
- sum(
- rate(
- haproxy_backend_retry_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- )
- ) by (instance)
- |||, 'Backend retry', 'time_series', 2
- ),
- u.addTargetSchema(
- |||
- sum(
- rate(
- haproxy_frontend_requests_denied_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- )
- ) by (instance)
- |||, 'Request denied', 'time_series', 2
- ),
- u.addTargetSchema(
- |||
- sum(
- haproxy_backend_current_queue{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}
- ) by (instance)
- |||, 'Backend Queued', 'time_series', 2
- ),
- ]
- )
- .addSeriesOverride([
- {
- alias: '/.*Response.*/',
- transform: 'negative-Y',
- },
- {
- alias: '/.*Backend.*/',
- transform: 'negative-Y',
- },
- ]),
- RgwOverviewPanel(
- 'Total number of connections',
- '',
- 'short',
- 'short',
- |||
- sum(
- rate(
- haproxy_frontend_connections_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- )
- ) by (instance)
- |||,
- 'Front',
- 10,
- 12,
- 5,
- 12,
- '$datasource',
- true,
- true,
- true,
- true,
- true,
- true
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
- sum(
- rate(
- haproxy_backend_connection_attempts_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- )
- ) by (instance)
- |||, 'Back'
- ),
- u.addTargetSchema(
- |||
- sum(
- rate(
- haproxy_backend_connection_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- )
- ) by (instance)
- |||, 'Back errors'
- ),
- ]
- )
- .addSeriesOverride([
- {
- alias: '/.*Back.*/',
- transform: 'negative-Y',
- },
- ]),
- RgwOverviewPanel(
- 'Current total of incoming / outgoing bytes',
- '',
- 'short',
- 'short',
- |||
- sum(
- rate(
- haproxy_frontend_bytes_in_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- ) * 8
- ) by (instance)
- |||,
- 'IN Front',
- 15,
- 12,
- 6,
- 12,
- '$datasource',
- true,
- true,
- true,
- true,
- true,
- true
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
- sum(
- rate(
- haproxy_frontend_bytes_out_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- ) * 8
- ) by (instance)
- |||, 'OUT Front', 'time_series', 2
- ),
- u.addTargetSchema(
- |||
- sum(
- rate(
- haproxy_backend_bytes_in_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- ) * 8
- ) by (instance)
- |||, 'IN Back', 'time_series', 2
- ),
- u.addTargetSchema(
- |||
- sum(
- rate(
- haproxy_backend_bytes_out_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
- ) * 8
- ) by (instance)
- |||, 'OUT Back', 'time_series', 2
- ),
- ]
- )
- .addSeriesOverride([
- {
- alias: '/.*OUT.*/',
- transform: 'negative-Y',
- },
- ]),
]),
- 'radosgw-detail.json':
- local RgwDetailsPanel(aliasColors,
- title,
- description,
- formatY1,
- formatY2,
- expr1,
- expr2,
- legendFormat1,
- legendFormat2,
- x,
- y,
- w,
- h) =
- u.graphPanelSchema(aliasColors,
- title,
- description,
- 'null',
- false,
- formatY1,
- formatY2,
- null,
- null,
- 0,
- 1,
- '$datasource')
- .addTargets(
- [u.addTargetSchema(expr1, legendFormat1), u.addTargetSchema(expr2, legendFormat2)]
- ) + { gridPos: { x: x, y: y, w: w, h: h } };
+ ]),
+ 'radosgw-detail.json':
+ local RgwDetailsPanel(aliasColors,
+ title,
+ description,
+ formatY1,
+ formatY2,
+ expr1,
+ expr2,
+ legendFormat1,
+ legendFormat2,
+ x,
+ y,
+ w,
+ h) =
+ $.graphPanelSchema(aliasColors,
+ title,
+ description,
+ 'null',
+ false,
+ formatY1,
+ formatY2,
+ null,
+ null,
+ 0,
+ 1,
+ '$datasource')
+ .addTargets(
+ [$.addTargetSchema(expr1, legendFormat1), $.addTargetSchema(expr2, legendFormat2)]
+ ) + { gridPos: { x: x, y: y, w: w, h: h } };
- u.dashboardSchema(
- 'RGW Instance Detail',
+ $.dashboardSchema(
+ 'RGW Instance Detail',
+ '',
+ 'x5ARzZtmk',
+ 'now-1h',
+ '15s',
+ 16,
+ $._config.dashboardTags + ['overview'],
+ '',
+ {
+ refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+ time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
+ }
+ )
+ .addAnnotation(
+ $.addAnnotationSchema(
+ 1,
+ '-- Grafana --',
+ true,
+ true,
+ 'rgba(0, 211, 255, 1)',
+ 'Annotations & Alerts',
+ 'dashboard'
+ )
+ )
+ .addRequired(
+ type='grafana', id='grafana', name='Grafana', version='5.0.0'
+ )
+ .addRequired(
+ type='panel',
+ id='grafana-piechart-panel',
+ name='Pie Chart',
+ version='1.3.3'
+ )
+ .addRequired(
+ type='panel', id='graph', name='Graph', version='5.0.0'
+ )
+ .addTemplate(
+ g.template.datasource('datasource',
+ 'prometheus',
+ 'default',
+ label='Data Source')
+ )
+ .addTemplate(
+ $.addClusterTemplate()
+ )
+ .addTemplate(
+ $.addJobTemplate()
+ )
+ .addTemplate(
+ $.addTemplateSchema('rgw_servers',
+ '$datasource',
+ 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
+ 1,
+ true,
+ 1,
+ '',
+ '')
+ )
+ .addPanels([
+ $.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
+ RgwDetailsPanel(
+ {},
+ '$rgw_servers GET/PUT Latencies',
'',
- 'x5ARzZtmk',
- 'now-1h',
- '15s',
- 16,
- c.dashboardTags + ['overview'],
+ 's',
+ 'short',
+ |||
+ sum by (instance_id) (
+ rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval])
+ ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ |||
+ sum by (instance_id) (
+ rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
+ rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval])
+ ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'GET {{ceph_daemon}}',
+ 'PUT {{ceph_daemon}}',
+ 0,
+ 1,
+ 6,
+ 8
+ ),
+ RgwDetailsPanel(
+ {},
+ 'Bandwidth by HTTP Operation',
'',
+ 'bytes',
+ 'short',
+ |||
+ rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ |||
+ rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon)
+ ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'GETs {{ceph_daemon}}',
+ 'PUTs {{ceph_daemon}}',
+ 6,
+ 1,
+ 7,
+ 8
+ ),
+ RgwDetailsPanel(
{
- refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
- time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
- }
- )
- .addAnnotation(
- u.addAnnotationSchema(
- 1,
- '-- Grafana --',
- true,
- true,
- 'rgba(0, 211, 255, 1)',
- 'Annotations & Alerts',
- 'dashboard'
- )
- )
- .addRequired(
- type='grafana', id='grafana', name='Grafana', version='5.0.0'
- )
- .addRequired(
- type='panel',
- id='grafana-piechart-panel',
- name='Pie Chart',
- version='1.3.3'
- )
- .addRequired(
- type='panel', id='graph', name='Graph', version='5.0.0'
- )
- .addTemplate(
- g.template.datasource('datasource',
- 'prometheus',
- 'default',
- label='Data Source')
- )
- .addTemplate(
- u.addClusterTemplate()
- )
- .addTemplate(
- u.addJobTemplate()
+ GETs: '#7eb26d',
+ Other: '#447ebc',
+ PUTs: '#eab839',
+ Requests: '#3f2b5b',
+ 'Requests Failed': '#bf1b00',
+ },
+ 'HTTP Request Breakdown',
+ '',
+ 'short',
+ 'short',
+ |||
+ rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s,ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ |||
+ rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'Requests Failed {{ceph_daemon}}',
+ 'GETs {{ceph_daemon}}',
+ 13,
+ 1,
+ 7,
+ 8
)
- .addTemplate(
- u.addTemplateSchema('rgw_servers',
- '$datasource',
- 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(),
- 1,
- true,
- 1,
- '',
- '')
+ .addTargets(
+ [
+ $.addTargetSchema(
+ |||
+ rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'PUTs {{ceph_daemon}}'
+ ),
+ $.addTargetSchema(
+ |||
+ (
+ rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
+ (
+ rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
+ )
+ ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'Other {{ceph_daemon}}'
+ ),
+ ]
+ ),
+ $.addPieChartSchema(
+ {
+ GETs: '#7eb26d',
+ 'Other (HEAD,POST,DELETE)': '#447ebc',
+ PUTs: '#eab839',
+ Requests: '#3f2b5b',
+ Failures: '#bf1b00',
+ }, '$datasource', '', 'Under graph', 'pie', 'Workload Breakdown', 'current'
)
- .addPanels([
- u.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
- RgwDetailsPanel(
- {},
- '$rgw_servers GET/PUT Latencies',
- '',
- 's',
- 'short',
- |||
- sum by (instance_id) (
- rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
- rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval])
- ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- |||
- sum by (instance_id) (
- rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
- rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval])
- ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- 'GET {{ceph_daemon}}',
- 'PUT {{ceph_daemon}}',
- 0,
- 1,
- 6,
- 8
- ),
- RgwDetailsPanel(
- {},
- 'Bandwidth by HTTP Operation',
- '',
- 'bytes',
- 'short',
- |||
- rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- |||
- rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon)
- ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- 'GETs {{ceph_daemon}}',
- 'PUTs {{ceph_daemon}}',
- 6,
- 1,
- 7,
- 8
- ),
- RgwDetailsPanel(
- {
- GETs: '#7eb26d',
- Other: '#447ebc',
- PUTs: '#eab839',
- Requests: '#3f2b5b',
- 'Requests Failed': '#bf1b00',
- },
- 'HTTP Request Breakdown',
- '',
- 'short',
- 'short',
- |||
- rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s,ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- |||
- rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- 'Requests Failed {{ceph_daemon}}',
- 'GETs {{ceph_daemon}}',
- 13,
- 1,
- 7,
- 8
- )
- .addTargets(
- [
- u.addTargetSchema(
- |||
- rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- 'PUTs {{ceph_daemon}}'
- ),
- u.addTargetSchema(
- |||
- (
- rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
- (
- rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
- rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
- )
- ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- 'Other {{ceph_daemon}}'
- ),
- ]
- ),
- u.addPieChartSchema(
- {
- GETs: '#7eb26d',
- 'Other (HEAD,POST,DELETE)': '#447ebc',
- PUTs: '#eab839',
- Requests: '#3f2b5b',
- Failures: '#bf1b00',
- }, '$datasource', '', 'Under graph', 'pie', 'Workload Breakdown', 'current'
- )
- .addTarget(u.addTargetSchema(
- |||
- rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- 'Failures {{ceph_daemon}}'
- ))
- .addTarget(u.addTargetSchema(
- |||
- rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- 'GETs {{ceph_daemon}}'
- ))
- .addTarget(u.addTargetSchema(
- |||
- rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
- on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- 'PUTs {{ceph_daemon}}'
- ))
- .addTarget(u.addTargetSchema(
- |||
- (
- rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
- (
- rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
- rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
- )
- ) * on (instance_id) group_left (ceph_daemon)
- ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
- ||| % u.matchers(),
- 'Other (DELETE,LIST) {{ceph_daemon}}'
- )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } },
- ]),
- },
+ .addTarget($.addTargetSchema(
+ |||
+ rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'Failures {{ceph_daemon}}'
+ ))
+ .addTarget($.addTargetSchema(
+ |||
+ rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'GETs {{ceph_daemon}}'
+ ))
+ .addTarget($.addTargetSchema(
+ |||
+ rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
+ on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'PUTs {{ceph_daemon}}'
+ ))
+ .addTarget($.addTargetSchema(
+ |||
+ (
+ rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
+ (
+ rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
+ rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
+ )
+ ) * on (instance_id) group_left (ceph_daemon)
+ ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
+ ||| % $.matchers(),
+ 'Other (DELETE,LIST) {{ceph_daemon}}'
+ )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } },
+ ]),
}
local g = import 'grafonnet/grafana.libsonnet';
-local c = (import '../mixin.libsonnet')._config;
{
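+ // _config must be overridden by the importer (e.g. clusterLabel, dashboardTags, showMultiCluster).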
+ _config:: error 'must provide _config',
+
dashboardSchema(title,
description,
uid,
matchers()::
local jobMatcher = 'job=~"$job"';
- local clusterMatcher = '%s=~"$cluster"' % c.clusterLabel;
+ local clusterMatcher = '%s=~"$cluster"' % $._config.clusterLabel;
{
// Common labels
jobMatcher: jobMatcher,
1,
'cluster',
'(.*)',
- if !c.showMultiCluster then 'variable' else '',
+ if !$._config.showMultiCluster then 'variable' else '',
multi=true,
allValues='.+',
),
(import 'config.libsonnet') +
-(import 'dashboards/dashboards.libsonnet') +
+(import 'dashboards.libsonnet') +
(import 'alerts.libsonnet')