From 94be469d39ee517ad6ab90f5f364981fab4c10fa Mon Sep 17 00:00:00 2001 From: Arthur Outhenin-Chalandre Date: Thu, 12 May 2022 17:24:35 +0200 Subject: [PATCH] ceph-mixin: fix linting issue and add cluster template support Fix most of the issues reported by dashboards-linter: - Add matcher/template for job (and also cluster) - use $__rate_interval everywhere Also this change all the irate functions to rate as most of irate where not actually used correctly. While using irate on graph for instance you can easily miss some of the metrics values as irate only take the two last values and the query steps can be quite large if you want a graph for a few hours/a day or more. Fixes: https://tracker.ceph.com/issues/55003 Signed-off-by: Arthur Outhenin-Chalandre ceph-mixin: add config with matchers and tags Signed-off-by: Arthur Outhenin-Chalandre (cherry picked from commit faeea8d165342245929ea26441ee0cbb8957e3a7) --- monitoring/ceph-mixin/config.libsonnet | 9 +- .../ceph-mixin/dashboards/cephfs.libsonnet | 17 +- .../dashboards/dashboards.libsonnet | 1 + .../ceph-mixin/dashboards/host.libsonnet | 187 ++++++------ .../ceph-mixin/dashboards/osd.libsonnet | 170 ++++++----- .../ceph-mixin/dashboards/pool.libsonnet | 187 +++++++----- .../ceph-mixin/dashboards/rbd.libsonnet | 94 +++--- .../ceph-mixin/dashboards/rgw.libsonnet | 280 +++++++++++------- .../ceph-mixin/dashboards/utils.libsonnet | 50 +++- monitoring/ceph-mixin/dashboards_out/.lint | 5 + .../dashboards_out/cephfs-overview.json | 52 +++- .../dashboards_out/host-details.json | 73 ++++- .../dashboards_out/hosts-overview.json | 68 ++++- .../dashboards_out/osd-device-details.json | 72 ++++- .../dashboards_out/osds-overview.json | 92 ++++-- .../dashboards_out/pool-detail.json | 68 ++++- .../dashboards_out/pool-overview.json | 94 ++++-- .../dashboards_out/radosgw-detail.json | 67 ++++- .../dashboards_out/radosgw-overview.json | 143 ++++++--- .../dashboards_out/radosgw-sync-overview.json | 79 +++-- 
.../dashboards_out/rbd-details.json | 74 ++++- .../dashboards_out/rbd-overview.json | 59 +++- 22 files changed, 1341 insertions(+), 600 deletions(-) create mode 100644 monitoring/ceph-mixin/dashboards_out/.lint diff --git a/monitoring/ceph-mixin/config.libsonnet b/monitoring/ceph-mixin/config.libsonnet index 0967ef424bce6..7c3216b36d489 100644 --- a/monitoring/ceph-mixin/config.libsonnet +++ b/monitoring/ceph-mixin/config.libsonnet @@ -1 +1,8 @@ -{} +{ + _config+:: { + dashboardTags: ['ceph-mixin'], + + clusterLabel: 'cluster', + showMultiCluster: false, + }, +} diff --git a/monitoring/ceph-mixin/dashboards/cephfs.libsonnet b/monitoring/ceph-mixin/dashboards/cephfs.libsonnet index 3d09a54536de6..ef7434d7f8d36 100644 --- a/monitoring/ceph-mixin/dashboards/cephfs.libsonnet +++ b/monitoring/ceph-mixin/dashboards/cephfs.libsonnet @@ -1,5 +1,6 @@ local g = import 'grafonnet/grafana.libsonnet'; local u = import 'utils.libsonnet'; +local c = (import '../mixin.libsonnet')._config; { grafanaDashboards+:: { @@ -28,7 +29,7 @@ local u = import 'utils.libsonnet'; 'now-1h', '15s', 16, - [], + c.dashboardTags, '', { refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -55,10 +56,16 @@ local u = import 'utils.libsonnet'; .addTemplate( g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') ) + .addTemplate( + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() + ) .addTemplate( u.addTemplateSchema('mds_servers', '$datasource', - 'label_values(ceph_mds_inodes, ceph_daemon)', + 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % u.matchers(), 1, true, 1, @@ -71,7 +78,7 @@ local u = import 'utils.libsonnet'; 'MDS Workload - $mds_servers', 'none', 'Reads(-) / Writes (+)', - 'sum(rate(ceph_objecter_op_r{ceph_daemon=~"($mds_servers).*"}[1m]))', + 'sum(rate(ceph_objecter_op_r{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % u.matchers(), 'Read Ops', 0, 1, @@ -79,7 +86,7 @@ local 
u = import 'utils.libsonnet'; 9 ) .addTarget(u.addTargetSchema( - 'sum(rate(ceph_objecter_op_w{ceph_daemon=~"($mds_servers).*"}[1m]))', + 'sum(rate(ceph_objecter_op_w{%(matchers)s, ceph_daemon=~"($mds_servers).*"}[$__rate_interval]))' % u.matchers(), 'Write Ops' )) .addSeriesOverride( @@ -89,7 +96,7 @@ local u = import 'utils.libsonnet'; 'Client Request Load - $mds_servers', 'none', 'Client Requests', - 'ceph_mds_server_handle_client_request{ceph_daemon=~"($mds_servers).*"}', + 'ceph_mds_server_handle_client_request{%(matchers)s, ceph_daemon=~"($mds_servers).*"}' % u.matchers(), '{{ceph_daemon}}', 12, 1, diff --git a/monitoring/ceph-mixin/dashboards/dashboards.libsonnet b/monitoring/ceph-mixin/dashboards/dashboards.libsonnet index 72ca483248f8e..d40025044fa56 100644 --- a/monitoring/ceph-mixin/dashboards/dashboards.libsonnet +++ b/monitoring/ceph-mixin/dashboards/dashboards.libsonnet @@ -1,3 +1,4 @@ +(import '../config.libsonnet') + (import 'cephfs.libsonnet') + (import 'host.libsonnet') + (import 'osd.libsonnet') + diff --git a/monitoring/ceph-mixin/dashboards/host.libsonnet b/monitoring/ceph-mixin/dashboards/host.libsonnet index 7006e890ab770..a1b03b10ec8ba 100644 --- a/monitoring/ceph-mixin/dashboards/host.libsonnet +++ b/monitoring/ceph-mixin/dashboards/host.libsonnet @@ -1,5 +1,6 @@ local g = import 'grafonnet/grafana.libsonnet'; local u = import 'utils.libsonnet'; +local c = (import '../mixin.libsonnet')._config; { grafanaDashboards+:: { @@ -46,7 +47,7 @@ local u = import 'utils.libsonnet'; 'now-1h', '10s', 16, - [], + c.dashboardTags, '', { refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -79,10 +80,16 @@ local u = import 'utils.libsonnet'; 'default', label='Data Source') ) + .addTemplate( + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() + ) .addTemplate( u.addTemplateSchema('osd_hosts', '$datasource', - 'label_values(ceph_disk_occupation, exported_instance)', + 
'label_values(ceph_disk_occupation{%(matchers)s}, exported_instance)' % u.matchers(), 1, true, 1, @@ -92,7 +99,7 @@ local u = import 'utils.libsonnet'; .addTemplate( u.addTemplateSchema('mon_hosts', '$datasource', - 'label_values(ceph_mon_metadata, ceph_daemon)', + 'label_values(ceph_mon_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), 1, true, 1, @@ -102,7 +109,7 @@ local u = import 'utils.libsonnet'; .addTemplate( u.addTemplateSchema('mds_hosts', '$datasource', - 'label_values(ceph_mds_inodes, ceph_daemon)', + 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % u.matchers(), 1, true, 1, @@ -112,7 +119,7 @@ local u = import 'utils.libsonnet'; .addTemplate( u.addTemplateSchema('rgw_hosts', '$datasource', - 'label_values(ceph_rgw_metadata, ceph_daemon)', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), 1, true, 1, @@ -125,7 +132,7 @@ local u = import 'utils.libsonnet'; 'OSD Hosts', '', 'current', - 'count(sum by (hostname) (ceph_osd_metadata))', + 'count(sum by (hostname) (ceph_osd_metadata{%(matchers)s}))' % u.matchers(), true, 0, 0, @@ -140,8 +147,8 @@ local u = import 'utils.libsonnet'; ||| avg(1 - ( avg by(instance) ( - irate(node_cpu_seconds_total{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or - irate(node_cpu{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) + rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or + rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) ) )) |||, @@ -175,8 +182,7 @@ local u = import 'utils.libsonnet'; node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} ) ) - ) - ( + ) / ( node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"} )) @@ -194,11 +200,11 @@ 
local u = import 'utils.libsonnet'; 'current', ||| sum (( - irate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[5m]) or - irate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[5m]) + rate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or + rate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval]) ) + ( - irate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[5m]) or - irate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[5m]) + rate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or + rate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval]) )) |||, true, @@ -215,17 +221,17 @@ local u = import 'utils.libsonnet'; ||| avg ( label_replace( - (irate(node_disk_io_time_ms[5m]) / 10 ) or - (irate(node_disk_io_time_seconds_total[5m]) * 100), + (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or + (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100), "instance", "$1", "instance", "([^.:]*).*" ) * on(instance, device) group_left(ceph_daemon) label_replace( label_replace( - ceph_disk_occupation_human{instance=~"($osd_hosts).*"}, + ceph_disk_occupation_human{%(matchers)s, instance=~"($osd_hosts).*"}, "device", "$1", "device", "/dev/(.*)" ), "instance", "$1", "instance", "([^.:]*).*" ) ) - |||, + ||| % u.matchers(), true, 16, 0, @@ -239,19 +245,19 @@ local u = import 'utils.libsonnet'; 'current', ||| sum ( - ( - irate(node_network_receive_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) or - irate(node_network_receive_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) - ) unless on (device, instance) - label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") + ( + rate(node_network_receive_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or + 
rate(node_network_receive_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) + ) unless on (device, instance) + label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") ) + sum ( - ( - irate(node_network_transmit_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) or - irate(node_network_transmit_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) - ) unless on (device, instance) - label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") - ) + ( + rate(node_network_transmit_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or + rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) + ) unless on (device, instance) + label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") + ) ||| , true, @@ -269,8 +275,8 @@ local u = import 'utils.libsonnet'; 100 * ( 1 - ( avg by(instance) ( - irate(node_cpu_seconds_total{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or - irate(node_cpu{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) + rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or + rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) ) ) ) @@ -286,14 +292,14 @@ local u = import 'utils.libsonnet'; 'Network Load - Top 10 Hosts', 'Top 10 hosts by network load', 'Bps', ||| topk(10, (sum by(instance) ( ( - irate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or - irate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) + 
rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or + rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) ) + ( - irate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or - irate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) + rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or + rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) ) unless on (device, instance) - label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")) + label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")) )) ||| , '{{instance}}', 12, 5, 12, 9 @@ -357,7 +363,7 @@ local u = import 'utils.libsonnet'; 'now-1h', '10s', 16, - ['overview'], + c.dashboardTags + ['overview'], '', { refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -382,7 +388,20 @@ local u = import 'utils.libsonnet'; g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') ) .addTemplate( - u.addTemplateSchema('ceph_hosts', '$datasource', 'label_values(node_scrape_collector_success, instance) ', 1, false, 3, 'Hostname', '([^.:]*).*') + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() + ) + .addTemplate( + u.addTemplateSchema('ceph_hosts', + '$datasource', + 'label_values({%(clusterMatcher)s}, instance)' % u.matchers(), + 1, + false, + 3, + 'Hostname', + '([^.:]*).*') ) .addPanels([ u.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, @@ -391,7 +410,7 @@ local u = import 'utils.libsonnet'; 'OSDs', '', 'current', - "count(sum by (ceph_daemon) 
(ceph_osd_metadata{hostname='$ceph_hosts'}))", + "count(sum by (ceph_daemon) (ceph_osd_metadata{%(matchers)s, hostname='$ceph_hosts'}))" % u.matchers(), 0, 1, 3, @@ -412,12 +431,12 @@ local u = import 'utils.libsonnet'; '% Utilization', ||| sum by (mode) ( - irate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[1m]) or - irate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[1m]) + rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) or + rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) ) / ( scalar( - sum(irate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[1m]) or - irate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[1m])) + sum(rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) ) * 100 ) |||, @@ -519,9 +538,9 @@ local u = import 'utils.libsonnet'; 'Send (-) / Receive (+)', ||| sum by (device) ( - irate( - node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m]) or - irate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m] + rate( + node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or + rate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval] ) ) |||, @@ -536,8 +555,8 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum by (device) ( - irate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m]) or - irate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m]) + 
rate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or + rate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) ) |||, '{{device}}.tx' @@ -555,8 +574,8 @@ local u = import 'utils.libsonnet'; 'pps', 'Send (-) / Receive (+)', ||| - irate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or - irate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) + rate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) |||, '{{device}}.rx', 21, @@ -568,8 +587,8 @@ local u = import 'utils.libsonnet'; [ u.addTargetSchema( ||| - irate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or - irate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) + rate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) |||, '{{device}}.tx' ), @@ -588,10 +607,10 @@ local u = import 'utils.libsonnet'; 'current', ||| sum( - ceph_osd_stat_bytes and - on (ceph_daemon) ceph_disk_occupation{instance=~"($ceph_hosts)([\\\\.:].*)?"} + ceph_osd_stat_bytes{%(matchers)s} and + on (ceph_daemon) ceph_disk_occupation{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"} ) - |||, + ||| % u.matchers(), 0, 6, 3, @@ -605,8 +624,8 @@ local u = import 'utils.libsonnet'; 'pps', 'Send (-) / Receive (+)', ||| - irate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or - irate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) + rate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) |||, 
'{{device}}.rx', 21, @@ -617,8 +636,8 @@ local u = import 'utils.libsonnet'; .addTargets( [u.addTargetSchema( ||| - irate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or - irate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) + rate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) |||, '{{device}}.tx' )] @@ -642,15 +661,15 @@ local u = import 'utils.libsonnet'; ||| label_replace( ( - irate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or - irate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) + rate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) ), "instance", "$1", "instance", "([^:.]*).*" ) * on(instance, device) group_left(ceph_daemon) label_replace( label_replace( - ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)" + ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)" ), "instance", "$1", "instance", "([^:.]*).*" ) - |||, + ||| % u.matchers(), '{{device}}({{ceph_daemon}}) writes', 0, 12, @@ -663,15 +682,15 @@ local u = import 'utils.libsonnet'; ||| label_replace( ( - irate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or - irate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) + rate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) ), "instance", "$1", "instance", "([^:.]*).*" ) * on(instance, device) group_left(ceph_daemon) label_replace( label_replace( - ceph_disk_occupation_human,"device", "$1", "device", "/dev/(.*)" + 
ceph_disk_occupation_human{%(matchers)s},"device", "$1", "device", "/dev/(.*)" ), "instance", "$1", "instance", "([^:.]*).*" ) - |||, + ||| % u.matchers(), '{{device}}({{ceph_daemon}}) reads' ), ] @@ -689,14 +708,14 @@ local u = import 'utils.libsonnet'; ||| label_replace( ( - irate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or - irate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) + rate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) ), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace( - label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), + label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*" ) - |||, + ||| % u.matchers(), '{{device}}({{ceph_daemon}}) write', 12, 12, @@ -708,15 +727,15 @@ local u = import 'utils.libsonnet'; ||| label_replace( ( - irate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or - irate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) + rate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or + rate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) ), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace( - label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), + label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*" ) - |||, + ||| % u.matchers(), '{{device}}({{ceph_daemon}}) read' )] ) @@ -732,10 +751,10 @@ local u = import 'utils.libsonnet'; '', ||| max by(instance, device) (label_replace( - 
(irate(node_disk_write_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])) / - clamp_min(irate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001) or - (irate(node_disk_read_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])) / - clamp_min(irate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001), + (rate(node_disk_write_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) / + clamp_min(rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001) or + (rate(node_disk_read_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) / + clamp_min(rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001), "instance", "$1", "instance", "([^:.]*).*" )) * on(instance, device) group_left(ceph_daemon) label_replace( label_replace( @@ -743,7 +762,7 @@ local u = import 'utils.libsonnet'; "device", "$1", "device", "/dev/(.*)" ), "instance", "$1", "instance", "([^:.]*).*" ) - |||, + ||| % u.matchers(), '{{device}}({{ceph_daemon}})', 0, 21, @@ -760,14 +779,14 @@ local u = import 'utils.libsonnet'; ||| label_replace( ( - (irate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) / 10) or - irate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) * 100 + (rate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) / 10) or + rate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) * 100 ), "instance", "$1", "instance", "([^:.]*).*" ) * on(instance, device) group_left(ceph_daemon) label_replace( - label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, + label_replace(ceph_disk_occupation_human{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*" 
) - |||, + ||| % u.matchers(), '{{device}}({{ceph_daemon}})', 12, 21, diff --git a/monitoring/ceph-mixin/dashboards/osd.libsonnet b/monitoring/ceph-mixin/dashboards/osd.libsonnet index 4c0623493abed..b08efa3637f03 100644 --- a/monitoring/ceph-mixin/dashboards/osd.libsonnet +++ b/monitoring/ceph-mixin/dashboards/osd.libsonnet @@ -1,5 +1,6 @@ local g = import 'grafonnet/grafana.libsonnet'; local u = import 'utils.libsonnet'; +local c = (import '../mixin.libsonnet')._config; { grafanaDashboards+:: { @@ -84,7 +85,7 @@ local u = import 'utils.libsonnet'; 'now-1h', '10s', 16, - [], + c.dashboardTags, '', { refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -117,6 +118,12 @@ local u = import 'utils.libsonnet'; .addTemplate( g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') ) + .addTemplate( + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() + ) .addPanels([ OsdOverviewGraphPanel( { '@95%ile': '#e0752d' }, @@ -125,7 +132,12 @@ local u = import 'utils.libsonnet'; 'ms', null, '0', - 'avg (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)', + ||| + avg ( + rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000 + ) + ||| % u.matchers(), 'AVG read', 0, 0, @@ -137,22 +149,22 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| max( - irate(ceph_osd_op_r_latency_sum[1m]) / - on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000 + rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000 ) - |||, + ||| % u.matchers(), 'MAX read' ), u.addTargetSchema( ||| quantile(0.95, ( - irate(ceph_osd_op_r_latency_sum[1m]) / - on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) + 
rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000 ) ) - |||, + ||| % u.matchers(), '@95%ile' ), ], @@ -175,13 +187,13 @@ local u = import 'utils.libsonnet'; topk(10, (sort( ( - irate(ceph_osd_op_r_latency_sum[1m]) / - on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * + rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000 ) )) ) - |||, + ||| % u.matchers(), '', 'table', 1, @@ -199,11 +211,11 @@ local u = import 'utils.libsonnet'; '0', ||| avg( - irate(ceph_osd_op_w_latency_sum[1m]) / - on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) + rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * 1000 ) - |||, + ||| % u.matchers(), 'AVG write', 12, 0, @@ -215,20 +227,20 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| max( - irate(ceph_osd_op_w_latency_sum[1m]) / - on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * + rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * 1000 ) - |||, 'MAX write' + ||| % u.matchers(), 'MAX write' ), u.addTargetSchema( ||| quantile(0.95, ( - irate(ceph_osd_op_w_latency_sum[1m]) / - on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * + rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * 1000 )) - |||, '@95%ile write' + ||| % u.matchers(), '@95%ile write' ), ], ), @@ -251,12 +263,12 @@ local u = import 'utils.libsonnet'; ||| topk(10, (sort( - (irate(ceph_osd_op_w_latency_sum[1m]) / - on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * + 
(rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) * 1000) )) ) - |||, + ||| % u.matchers(), '', 'table', 1, @@ -267,50 +279,50 @@ local u = import 'utils.libsonnet'; {}, '', 'OSD Types Summary' ) .addTarget( - u.addTargetSchema('count by (device_class) (ceph_osd_metadata)', '{{device_class}}') + u.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % u.matchers(), '{{device_class}}') ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } }, OsdOverviewPieChartPanel( { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types' ) .addTarget( u.addTargetSchema( - 'count(ceph_bluefs_wal_total_bytes)', 'bluestore', 'time_series', 2 + 'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % u.matchers(), 'bluestore', 'time_series', 2 ) ) .addTarget( u.addTargetSchema( - 'absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)', 'filestore', 'time_series', 2 + 'absent(ceph_bluefs_wal_total_bytes%(matchers)s) * count(ceph_osd_metadata{%(matchers)s})' % u.matchers(), 'filestore', 'time_series', 2 ) ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } }, OsdOverviewPieChartPanel( {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary' ) .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes < 1099511627776)', '<1TB', 'time_series', 2 + 'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % u.matchers(), '<1TB', 'time_series', 2 )) .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes >= 1099511627776 < 2199023255552)', '<2TB', 'time_series', 2 + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % u.matchers(), '<2TB', 'time_series', 2 )) .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes >= 2199023255552 < 3298534883328)', '<3TB', 'time_series', 2 + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % u.matchers(), '<3TB', 'time_series', 2 )) 
.addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes >= 3298534883328 < 4398046511104)', '<4TB', 'time_series', 2 + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % u.matchers(), '<4TB', 'time_series', 2 )) .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes >= 4398046511104 < 6597069766656)', '<6TB', 'time_series', 2 + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % u.matchers(), '<6TB', 'time_series', 2 )) .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208)', '<8TB', 'time_series', 2 + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % u.matchers(), '<8TB', 'time_series', 2 )) .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes >= 8796093022208 < 10995116277760)', '<10TB', 'time_series', 2 + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % u.matchers(), '<10TB', 'time_series', 2 )) .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes >= 10995116277760 < 13194139533312)', '<12TB', 'time_series', 2 + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % u.matchers(), '<12TB', 'time_series', 2 )) .addTarget(u.addTargetSchema( - 'count(ceph_osd_stat_bytes >= 13194139533312)', '<12TB+', 'time_series', 2 + 'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % u.matchers(), '<12TB+', 'time_series', 2 )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } }, g.graphPanel.new(bars=true, datasource='$datasource', @@ -324,7 +336,7 @@ local u = import 'utils.libsonnet'; min='0', nullPointMode='null') .addTarget(u.addTargetSchema( - 'ceph_osd_numpg', 'PGs per OSD', 'time_series', 1, true + 'ceph_osd_numpg{%(matchers)s}' % u.matchers(), 'PGs per OSD', 'time_series', 1, true )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } }, OsdOverviewSingleStatPanel( ['#d44a3a', '#299c46'], @@ -338,11 +350,11 @@ local u = import 'utils.libsonnet'; false, '.75', ||| - 
sum(ceph_bluestore_onode_hits) / ( - sum(ceph_bluestore_onode_hits) + - sum(ceph_bluestore_onode_misses) + sum(ceph_bluestore_onode_hits{%(matchers)s}) / ( + sum(ceph_bluestore_onode_hits{%(matchers)s}) + + sum(ceph_bluestore_onode_misses{%(matchers)s}) ) - |||, + ||| % u.matchers(), 20, 8, 4, @@ -358,7 +370,7 @@ local u = import 'utils.libsonnet'; 'short', null, null, - 'round(sum(irate(ceph_pool_rd[30s])))', + 'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % u.matchers(), 'Reads', 0, 17, @@ -366,7 +378,7 @@ local u = import 'utils.libsonnet'; 8 ) .addTargets([u.addTargetSchema( - 'round(sum(irate(ceph_pool_wr[30s])))', 'Writes' + 'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % u.matchers(), 'Writes' )]), ]), 'osd-device-details.json': @@ -409,7 +421,7 @@ local u = import 'utils.libsonnet'; 'now-3h', '', 16, - [], + c.dashboardTags, '', { refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -439,10 +451,16 @@ local u = import 'utils.libsonnet'; 'default', label='Data Source') ) + .addTemplate( + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() + ) .addTemplate( u.addTemplateSchema('osd', '$datasource', - 'label_values(ceph_osd_metadata,ceph_daemon)', + 'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), 1, false, 1, @@ -459,13 +477,13 @@ local u = import 'utils.libsonnet'; 's', 'Read (-) / Write (+)', ||| - irate(ceph_osd_op_r_latency_sum{ceph_daemon=~"$osd"}[1m]) / - on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) - |||, + rate(ceph_osd_op_r_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) / + on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) + ||| % u.matchers(), ||| - irate(ceph_osd_op_w_latency_sum{ceph_daemon=~"$osd"}[1m]) / - on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) - |||, + rate(ceph_osd_op_w_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) / + on 
(ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) + ||| % u.matchers(), 'read', 'write', 0, @@ -484,8 +502,8 @@ local u = import 'utils.libsonnet'; '', 'short', 'Read (-) / Write (+)', - 'irate(ceph_osd_op_r{ceph_daemon=~"$osd"}[1m])', - 'irate(ceph_osd_op_w{ceph_daemon=~"$osd"}[1m])', + 'rate(ceph_osd_op_r{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(), + 'rate(ceph_osd_op_w{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(), 'Reads', 'Writes', 6, @@ -501,8 +519,8 @@ local u = import 'utils.libsonnet'; '', 'bytes', 'Read (-) / Write (+)', - 'irate(ceph_osd_op_r_out_bytes{ceph_daemon=~"$osd"}[1m])', - 'irate(ceph_osd_op_w_in_bytes{ceph_daemon=~"$osd"}[1m])', + 'rate(ceph_osd_op_r_out_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(), + 'rate(ceph_osd_op_w_in_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % u.matchers(), 'Read Bytes', 'Write Bytes', 12, @@ -522,28 +540,30 @@ local u = import 'utils.libsonnet'; ||| ( label_replace( - irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), + rate(node_disk_read_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) / + rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*" ) and on (instance, device) label_replace( label_replace( - ceph_disk_occupation_human{ceph_daemon=~"$osd"}, + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" ), "instance", "$1", "instance", "([^:.]*).*" ) ) - |||, + ||| % u.matchers(), ||| ( label_replace( - irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), + rate(node_disk_write_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) / + rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) 
label_replace( label_replace( - ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" ), "instance", "$1", "instance", "([^:.]*).*" ) ) - |||, + ||| % u.matchers(), '{{instance}}/{{device}} Reads', '{{instance}}/{{device}} Writes', 0, @@ -561,26 +581,26 @@ local u = import 'utils.libsonnet'; 'Read (-) / Write (+)', ||| label_replace( - irate(node_disk_writes_completed_total[1m]), + rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*" ) and on (instance, device) label_replace( label_replace( - ceph_disk_occupation_human{ceph_daemon=~"$osd"}, + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" ), "instance", "$1", "instance", "([^:.]*).*" ) - |||, + ||| % u.matchers(), ||| label_replace( - irate(node_disk_reads_completed_total[1m]), + rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*" ) and on (instance, device) label_replace( label_replace( - ceph_disk_occupation_human{ceph_daemon=~"$osd"}, + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" ), "instance", "$1", "instance", "([^:.]*).*" ) - |||, + ||| % u.matchers(), '{{device}} on {{instance}} Writes', '{{device}} on {{instance}} Reads', 6, @@ -598,24 +618,24 @@ local u = import 'utils.libsonnet'; 'Read (-) / Write (+)', ||| label_replace( - irate(node_disk_read_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*" + rate(node_disk_read_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*" ) and on (instance, device) label_replace( label_replace( - ceph_disk_occupation_human{ceph_daemon=~"$osd"}, + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" ), 
"instance", "$1", "instance", "([^:.]*).*" ) - |||, + ||| % u.matchers(), ||| label_replace( - irate(node_disk_written_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*" + rate(node_disk_written_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*" ) and on (instance, device) label_replace( label_replace( - ceph_disk_occupation_human{ceph_daemon=~"$osd"}, + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" ), "instance", "$1", "instance", "([^:.]*).*" ) - |||, + ||| % u.matchers(), '{{instance}} {{device}} Reads', '{{instance}} {{device}} Writes', 12, @@ -643,14 +663,14 @@ local u = import 'utils.libsonnet'; .addTarget(u.addTargetSchema( ||| label_replace( - irate(node_disk_io_time_seconds_total[1m]), + rate(node_disk_io_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*" ) and on (instance, device) label_replace( label_replace( - ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" + ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)" ), "instance", "$1", "instance", "([^:.]*).*" ) - |||, + ||| % u.matchers(), '{{device}} on {{instance}}' )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } }, ]), diff --git a/monitoring/ceph-mixin/dashboards/pool.libsonnet b/monitoring/ceph-mixin/dashboards/pool.libsonnet index 12f7c6789ec28..400a07e19e672 100644 --- a/monitoring/ceph-mixin/dashboards/pool.libsonnet +++ b/monitoring/ceph-mixin/dashboards/pool.libsonnet @@ -1,5 +1,6 @@ local g = import 'grafonnet/grafana.libsonnet'; local u = import 'utils.libsonnet'; +local c = (import '../mixin.libsonnet')._config; { grafanaDashboards+:: { @@ -85,7 +86,7 @@ local u = import 'utils.libsonnet'; 'now-1h', '15s', 22, - [], + c.dashboardTags, '', { refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -104,10 +105,13 @@ 
local u = import 'utils.libsonnet'; ) ) .addTemplate( - g.template.datasource('datasource', - 'prometheus', - 'Dashboard1', - label='Data Source') + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() ) .addTemplate( g.template.custom(label='TopK', @@ -121,7 +125,7 @@ local u = import 'utils.libsonnet'; 'Pools', '', 'avg', - 'count(ceph_pool_metadata)', + 'count(ceph_pool_metadata{%(matchers)s})' % u.matchers(), true, 'table', 0, @@ -134,7 +138,7 @@ local u = import 'utils.libsonnet'; 'Pools with Compression', 'Count of the pools that have compression enabled', 'current', - 'count(ceph_pool_metadata{compression_mode!="none"})', + 'count(ceph_pool_metadata{%(matchers)s, compression_mode!="none"})' % u.matchers(), null, '', 3, @@ -147,7 +151,7 @@ local u = import 'utils.libsonnet'; 'Total Raw Capacity', 'Total raw capacity available to the cluster', 'current', - 'sum(ceph_osd_stat_bytes)', + 'sum(ceph_osd_stat_bytes{%(matchers)s})' % u.matchers(), null, '', 6, @@ -160,7 +164,7 @@ local u = import 'utils.libsonnet'; 'Raw Capacity Consumed', 'Total raw capacity consumed by user data and associated overheads (metadata + redundancy)', 'current', - 'sum(ceph_pool_bytes_used)', + 'sum(ceph_pool_bytes_used{%(matchers)s})' % u.matchers(), true, '', 9, @@ -173,7 +177,7 @@ local u = import 'utils.libsonnet'; 'Logical Stored ', 'Total of client data stored in the cluster', 'current', - 'sum(ceph_pool_stored)', + 'sum(ceph_pool_stored{%(matchers)s})' % u.matchers(), true, '', 12, @@ -186,7 +190,12 @@ local u = import 'utils.libsonnet'; 'Compression Savings', 'A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression', 'current', - 'sum(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used)', + ||| + sum( + ceph_pool_compress_under_bytes{%(matchers)s} - + 
ceph_pool_compress_bytes_used{%(matchers)s} + ) + ||| % u.matchers(), null, '', 15, @@ -201,10 +210,10 @@ local u = import 'utils.libsonnet'; 'current', ||| ( - sum(ceph_pool_compress_under_bytes > 0) / - sum(ceph_pool_stored_raw and ceph_pool_compress_under_bytes > 0) + sum(ceph_pool_compress_under_bytes{%(matchers)s} > 0) / + sum(ceph_pool_stored_raw{%(matchers)s} and ceph_pool_compress_under_bytes{%(matchers)s} > 0) ) * 100 - |||, + ||| % u.matchers(), null, 'table', 18, @@ -217,7 +226,12 @@ local u = import 'utils.libsonnet'; 'Compression Factor', 'This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. It does not account for data written that was ineligible for compression (too small, or compression yield too low)', 'current', - 'sum(ceph_pool_compress_under_bytes > 0) / sum(ceph_pool_compress_bytes_used > 0)', + ||| + sum( + ceph_pool_compress_under_bytes{%(matchers)s} > 0) + / sum(ceph_pool_compress_bytes_used{%(matchers)s} > 0 + ) + ||| % u.matchers(), null, '', 21, @@ -258,72 +272,98 @@ local u = import 'utils.libsonnet'; [ u.addTargetSchema( ||| - (ceph_pool_compress_under_bytes / ceph_pool_compress_bytes_used > 0) and on(pool_id) ( - ((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100 > 0.5 + ( + ceph_pool_compress_under_bytes{%(matchers)s} / + ceph_pool_compress_bytes_used{%(matchers)s} > 0 + ) and on(pool_id) ( + ( + (ceph_pool_compress_under_bytes{%(matchers)s} > 0) / + ceph_pool_stored_raw{%(matchers)s} + ) * 100 > 0.5 ) - |||, + ||| % u.matchers(), 'A', 'table', 1, true ), u.addTargetSchema( - 'ceph_pool_max_avail * on(pool_id) group_left(name) ceph_pool_metadata', + ||| + ceph_pool_max_avail{%(matchers)s} * + on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s} + ||| % u.matchers(), 'B', 'table', 1, true ), u.addTargetSchema( - '((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100', + ||| + ( + (ceph_pool_compress_under_bytes{%(matchers)s} > 0) / + 
ceph_pool_stored_raw{%(matchers)s} + ) * 100 + ||| % u.matchers(), 'C', 'table', 1, true ), u.addTargetSchema( - '(ceph_pool_percent_used * on(pool_id) group_left(name) ceph_pool_metadata)', + ||| + ceph_pool_percent_used{%(matchers)s} * + on(pool_id) group_left(name) ceph_pool_metadata{%(matchers)s} + ||| % u.matchers(), 'D', 'table', 1, true ), u.addTargetSchema( - '(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used > 0)', + ||| + ceph_pool_compress_under_bytes{%(matchers)s} - + ceph_pool_compress_bytes_used{%(matchers)s} > 0 + ||| % u.matchers(), 'E', 'table', 1, true ), u.addTargetSchema( - 'delta(ceph_pool_stored[5d])', 'F', 'table', 1, true + 'delta(ceph_pool_stored{%(matchers)s}[5d])' % u.matchers(), 'F', 'table', 1, true ), u.addTargetSchema( - 'rate(ceph_pool_rd[30s]) + rate(ceph_pool_wr[30s])', + ||| + rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) + ||| % u.matchers(), 'G', 'table', 1, true ), u.addTargetSchema( - 'rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])', + ||| + rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) + ||| % u.matchers(), 'H', 'table', 1, true ), u.addTargetSchema( - 'ceph_pool_metadata', 'I', 'table', 1, true + 'ceph_pool_metadata{%(matchers)s}' % u.matchers(), 'I', 'table', 1, true ), u.addTargetSchema( - 'ceph_pool_stored * on(pool_id) group_left ceph_pool_metadata', + 'ceph_pool_stored{%(matchers)s} * on(pool_id) group_left ceph_pool_metadata{%(matchers)s}' % u.matchers(), 'J', 'table', 1, true ), u.addTargetSchema( - 'ceph_pool_metadata{compression_mode!="none"}', 'K', 'table', 1, true + 'ceph_pool_metadata{%(matchers)s, compression_mode!="none"}' % u.matchers(), 'K', 'table', 1, true ), u.addTargetSchema('', 'L', '', '', null), ] @@ -336,10 +376,12 @@ local u = import 'utils.libsonnet'; ||| topk($topk, round( - (rate(ceph_pool_rd[30s]) + rate(ceph_pool_wr[30s])), - 1 - ) * 
on(pool_id) group_left(instance,name) ceph_pool_metadata) - |||, + ( + rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) + ), 1 + ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s}) + ||| % u.matchers(), '{{name}} ', 0, 9, @@ -350,10 +392,10 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| topk($topk, - rate(ceph_pool_wr[30s]) + - on(pool_id) group_left(instance,name) ceph_pool_metadata + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) + + on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s} ) - |||, + ||| % u.matchers(), '{{name}} - write' ) ), @@ -364,10 +406,12 @@ local u = import 'utils.libsonnet'; 'Throughput', ||| topk($topk, - (rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])) * - on(pool_id) group_left(instance, name) ceph_pool_metadata + ( + rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + + rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) + ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s} ) - |||, + ||| % u.matchers(), '{{name}}', 12, 9, @@ -379,7 +423,7 @@ local u = import 'utils.libsonnet'; 'Historical view of capacity usage, to help identify growth and trends in pool consumption', 'bytes', 'Capacity Used', - 'ceph_pool_bytes_used * on(pool_id) group_right ceph_pool_metadata', + 'ceph_pool_bytes_used{%(matchers)s} * on(pool_id) group_right ceph_pool_metadata{%(matchers)s}' % u.matchers(), '{{name}}', 0, 17, @@ -450,7 +494,7 @@ local u = import 'utils.libsonnet'; 'now-1h', '15s', 22, - [], + c.dashboardTags, '', { refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -478,15 +522,18 @@ local u = import 'utils.libsonnet'; ) ) .addTemplate( - g.template.datasource('datasource', - 'prometheus', - 'Prometheus admin.virt1.home.fajerski.name:9090', - label='Data Source') + g.template.datasource('datasource', 'prometheus', 'default', label='Data 
Source') + ) + .addTemplate( + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() ) .addTemplate( u.addTemplateSchema('pool_name', '$datasource', - 'label_values(ceph_pool_metadata,name)', + 'label_values(ceph_pool_metadata{%(matchers)s}, name)' % u.matchers(), 1, false, 1, @@ -505,9 +552,9 @@ local u = import 'utils.libsonnet'; true, '.7,.8', ||| - (ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)) * - on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"} - |||, + (ceph_pool_stored{%(matchers)s} / (ceph_pool_stored{%(matchers)s} + ceph_pool_max_avail{%(matchers)s})) * + on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % u.matchers(), 'time_series', 0, 0, @@ -517,7 +564,7 @@ local u = import 'utils.libsonnet'; PoolDetailSingleStatPanel( 's', 'Time till full', - 'Time till pool is full assuming the average fill rate of the last 6 hours', + 'Time till pool is full assuming the average fill rate of the last 4 hours', false, 100, false, @@ -525,9 +572,9 @@ local u = import 'utils.libsonnet'; '', 'current', ||| - (ceph_pool_max_avail / deriv(ceph_pool_stored[6h])) * - on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"} > 0 - |||, + (ceph_pool_max_avail{%(matchers)s} / deriv(ceph_pool_stored{%(matchers)s}[6h])) * + on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} > 0 + ||| % u.matchers(), 'time_series', 7, 0, @@ -545,9 +592,9 @@ local u = import 'utils.libsonnet'; 'ops', 'Objects out(-) / in(+) ', ||| - deriv(ceph_pool_objects[1m]) * - on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"} - |||, + deriv(ceph_pool_objects{%(matchers)s}[1m]) * + on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % u.matchers(), 'Objects per second', 12, 0, @@ -564,9 +611,9 @@ local u = import 'utils.libsonnet'; 'iops', 'Read (-) / Write (+)', ||| - 
irate(ceph_pool_rd[1m]) * - on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"} - |||, + rate(ceph_pool_rd{%(matchers)s}[$__rate_interval]) * + on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % u.matchers(), 'reads', 0, 7, @@ -577,9 +624,9 @@ local u = import 'utils.libsonnet'; .addTarget( u.addTargetSchema( ||| - irate(ceph_pool_wr[1m]) * - on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"} - |||, + rate(ceph_pool_wr{%(matchers)s}[$__rate_interval]) * + on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % u.matchers(), 'writes' ) ), @@ -593,9 +640,9 @@ local u = import 'utils.libsonnet'; 'Bps', 'Read (-) / Write (+)', ||| - irate(ceph_pool_rd_bytes[1m]) + - on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~"$pool_name"} - |||, + rate(ceph_pool_rd_bytes{%(matchers)s}[$__rate_interval]) + + on(pool_id) group_left(instance, name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % u.matchers(), 'reads', 12, 7, @@ -606,9 +653,9 @@ local u = import 'utils.libsonnet'; .addTarget( u.addTargetSchema( ||| - irate(ceph_pool_wr_bytes[1m]) + - on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"} - |||, + rate(ceph_pool_wr_bytes{%(matchers)s}[$__rate_interval]) + + on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % u.matchers(), 'writes' ) ), @@ -622,9 +669,9 @@ local u = import 'utils.libsonnet'; 'short', 'Objects', ||| - ceph_pool_objects * - on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"} - |||, + ceph_pool_objects{%(matchers)s} * + on(pool_id) group_left(instance,name) ceph_pool_metadata{%(matchers)s, name=~"$pool_name"} + ||| % u.matchers(), 'Number of Objects', 0, 14, diff --git a/monitoring/ceph-mixin/dashboards/rbd.libsonnet b/monitoring/ceph-mixin/dashboards/rbd.libsonnet index 
88e2568ecedfd..ba3db60e53813 100644 --- a/monitoring/ceph-mixin/dashboards/rbd.libsonnet +++ b/monitoring/ceph-mixin/dashboards/rbd.libsonnet @@ -1,5 +1,6 @@ local g = import 'grafonnet/grafana.libsonnet'; local u = import 'utils.libsonnet'; +local c = (import '../mixin.libsonnet')._config; { grafanaDashboards+:: { @@ -16,7 +17,7 @@ local u = import 'utils.libsonnet'; null, 0, 1, - '$Datasource') + '$datasource') .addTargets( [ u.addTargetSchema(expr1, @@ -32,7 +33,7 @@ local u = import 'utils.libsonnet'; 'now-1h', false, 16, - [], + c.dashboardTags, '', { refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -57,11 +58,17 @@ local u = import 'utils.libsonnet'; type='panel', id='graph', name='Graph', version='5.0.0' ) .addTemplate( - g.template.datasource('Datasource', 'prometheus', 'default', label=null) + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() ) .addTemplate( - u.addTemplateSchema('Pool', - '$Datasource', + u.addTemplateSchema('pool', + '$datasource', 'label_values(pool)', 1, false, @@ -70,8 +77,8 @@ local u = import 'utils.libsonnet'; '') ) .addTemplate( - u.addTemplateSchema('Image', - '$Datasource', + u.addTemplateSchema('image', + '$datasource', 'label_values(image)', 1, false, @@ -83,8 +90,9 @@ local u = import 'utils.libsonnet'; RbdDetailsPanel( 'IOPS', 'iops', - 'irate(ceph_rbd_write_ops{pool="$Pool", image="$Image"}[30s])', - 'irate(ceph_rbd_read_ops{pool="$Pool", image="$Image"}[30s])', + 'rate(ceph_rbd_write_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers() + , + 'rate(ceph_rbd_read_ops{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers(), 0, 0, 8, @@ -93,8 +101,8 @@ local u = import 'utils.libsonnet'; RbdDetailsPanel( 'Throughput', 'Bps', - 'irate(ceph_rbd_write_bytes{pool="$Pool", image="$Image"}[30s])', - 
'irate(ceph_rbd_read_bytes{pool="$Pool", image="$Image"}[30s])', + 'rate(ceph_rbd_write_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers(), + 'rate(ceph_rbd_read_bytes{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval])' % u.matchers(), 8, 0, 8, @@ -104,13 +112,13 @@ local u = import 'utils.libsonnet'; 'Average Latency', 'ns', ||| - irate(ceph_rbd_write_latency_sum{pool="$Pool", image="$Image"}[30s]) / - irate(ceph_rbd_write_latency_count{pool="$Pool", image="$Image"}[30s]) - |||, + rate(ceph_rbd_write_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) / + rate(ceph_rbd_write_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) + ||| % u.matchers(), ||| - irate(ceph_rbd_read_latency_sum{pool="$Pool", image="$Image"}[30s]) / - irate(ceph_rbd_read_latency_count{pool="$Pool", image="$Image"}[30s]) - |||, + rate(ceph_rbd_read_latency_sum{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) / + rate(ceph_rbd_read_latency_count{%(matchers)s, pool="$pool", image="$image"}[$__rate_interval]) + ||| % u.matchers(), 16, 0, 8, @@ -168,7 +176,7 @@ local u = import 'utils.libsonnet'; 'now-1h', '30s', 16, - ['overview'], + c.dashboardTags + ['overview'], '', { refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -199,17 +207,20 @@ local u = import 'utils.libsonnet'; type='panel', id='table', name='Table', version='5.0.0' ) .addTemplate( - g.template.datasource('datasource', - 'prometheus', - 'default', - label='Data Source') + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() ) .addPanels([ RbdOverviewPanel( 'IOPS', 'short', - 'round(sum(irate(ceph_rbd_write_ops[30s])))', - 'round(sum(irate(ceph_rbd_read_ops[30s])))', + 'round(sum(rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval])))' % u.matchers(), + 
'round(sum(rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval])))' % u.matchers(), 'Writes', 'Reads', 0, @@ -220,8 +231,8 @@ local u = import 'utils.libsonnet'; RbdOverviewPanel( 'Throughput', 'Bps', - 'round(sum(irate(ceph_rbd_write_bytes[30s])))', - 'round(sum(irate(ceph_rbd_read_bytes[30s])))', + 'round(sum(rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval])))' % u.matchers(), + 'round(sum(rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval])))' % u.matchers(), 'Write', 'Read', 8, @@ -234,16 +245,16 @@ local u = import 'utils.libsonnet'; 'ns', ||| round( - sum(irate(ceph_rbd_write_latency_sum[30s])) / - sum(irate(ceph_rbd_write_latency_count[30s])) + sum(rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval])) / + sum(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval])) ) - |||, + ||| % u.matchers(), ||| round( - sum(irate(ceph_rbd_read_latency_sum[30s])) / - sum(irate(ceph_rbd_read_latency_count[30s])) + sum(rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval])) / + sum(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval])) ) - |||, + ||| % u.matchers(), 'Write', 'Read', 16, @@ -270,12 +281,12 @@ local u = import 'utils.libsonnet'; topk(10, ( sort(( - irate(ceph_rbd_write_ops[30s]) + - on (image, pool, namespace) irate(ceph_rbd_read_ops[30s]) + rate(ceph_rbd_write_ops{%(matchers)s}[$__rate_interval]) + + on (image, pool, namespace) rate(ceph_rbd_read_ops{%(matchers)s}[$__rate_interval]) )) ) ) - |||, + ||| % u.matchers(), '', 'table', 1, @@ -301,11 +312,12 @@ local u = import 'utils.libsonnet'; topk(10, sort( sum( - irate(ceph_rbd_read_bytes[30s]) + irate(ceph_rbd_write_bytes[30s]) + rate(ceph_rbd_read_bytes{%(matchers)s}[$__rate_interval]) + + rate(ceph_rbd_write_bytes{%(matchers)s}[$__rate_interval]) ) by (pool, image, namespace) ) ) - |||, + ||| % u.matchers(), '', 'table', 1, @@ -330,11 +342,13 @@ local u = import 'utils.libsonnet'; ||| topk(10, sum( - irate(ceph_rbd_write_latency_sum[30s]) / 
clamp_min(irate(ceph_rbd_write_latency_count[30s]), 1) + - irate(ceph_rbd_read_latency_sum[30s]) / clamp_min(irate(ceph_rbd_read_latency_count[30s]), 1) + rate(ceph_rbd_write_latency_sum{%(matchers)s}[$__rate_interval]) / + clamp_min(rate(ceph_rbd_write_latency_count{%(matchers)s}[$__rate_interval]), 1) + + rate(ceph_rbd_read_latency_sum{%(matchers)s}[$__rate_interval]) / + clamp_min(rate(ceph_rbd_read_latency_count{%(matchers)s}[$__rate_interval]), 1) ) by (pool, image, namespace) ) - |||, + ||| % u.matchers(), '', 'table', 1, diff --git a/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/monitoring/ceph-mixin/dashboards/rgw.libsonnet index 9c3333ebde739..2e17bb75b53ee 100644 --- a/monitoring/ceph-mixin/dashboards/rgw.libsonnet +++ b/monitoring/ceph-mixin/dashboards/rgw.libsonnet @@ -1,5 +1,6 @@ local g = import 'grafonnet/grafana.libsonnet'; local u = import 'utils.libsonnet'; +local c = (import '../mixin.libsonnet')._config; { grafanaDashboards+:: { @@ -18,8 +19,13 @@ local u = import 'utils.libsonnet'; 1, '$datasource') .addTargets( - [u.addTargetSchema('sum by (source_zone) (rate(%s[30s]))' % rgwMetric, - '{{source_zone}}')] + [ + u.addTargetSchema( + 'sum by (source_zone) (rate(%(rgwMetric)s{%(matchers)s}[$__rate_interval]))' + % (u.matchers() + { rgwMetric: rgwMetric }), + '{{source_zone}}' + ), + ] ) + { gridPos: { x: x, y: y, w: w, h: h } }; u.dashboardSchema( @@ -29,7 +35,7 @@ local u = import 'utils.libsonnet'; 'now-1h', '15s', 16, - ['overview'], + c.dashboardTags + ['overview'], '', { refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -54,10 +60,25 @@ local u = import 'utils.libsonnet'; type='panel', id='graph', name='Graph', version='5.0.0' ) .addTemplate( - u.addTemplateSchema('rgw_servers', '$datasource', 'prometehus', 1, true, 1, '', '') + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') ) .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', 
label='Data Source') + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() + ) + .addTemplate( + u.addTemplateSchema( + 'rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), + 1, + true, + 1, + '', + 'RGW Server' + ) ) .addPanels([ RgwSyncOverviewPanel( @@ -152,7 +173,7 @@ local u = import 'utils.libsonnet'; 'now-1h', '15s', 16, - ['overview'], + c.dashboardTags + ['overview'], '', { refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -176,23 +197,35 @@ local u = import 'utils.libsonnet'; .addRequired( type='panel', id='graph', name='Graph', version='5.0.0' ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() + ) .addTemplate( u.addTemplateSchema( 'rgw_servers', '$datasource', - 'label_values(ceph_rgw_metadata, ceph_daemon)', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), 1, true, 1, '', - '' + 'RGW Server' ) ) .addTemplate( u.addTemplateSchema( 'code', '$datasource', - 'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)', + 'label_values(haproxy_server_http_responses_total{job=~"$job_haproxy", instance=~"$ingress_service"}, code)', 1, true, 1, @@ -200,11 +233,25 @@ local u = import 'utils.libsonnet'; '' ) ) + .addTemplate( + u.addTemplateSchema( + 'job_haproxy', + '$datasource', + 'label_values(haproxy_server_status, job)', + 1, + true, + 1, + 'job haproxy', + '(.*)', + multi=true, + allValues='.+', + ), + ) .addTemplate( u.addTemplateSchema( 'ingress_service', '$datasource', - 'label_values(haproxy_server_status, instance)', + 'label_values(haproxy_server_status{job=~"$job_haproxy"}, instance)', 1, true, 1, @@ -212,12 +259,6 @@ local u = import 'utils.libsonnet'; '' ) ) - .addTemplate( - g.template.datasource('datasource', - 'prometheus', - 
'default', - label='Data Source') - ) .addPanels([ u.addRowSchema(false, true, @@ -231,10 +272,10 @@ local u = import 'utils.libsonnet'; 's', 'short', ||| - rate(ceph_rgw_get_initial_lat_sum[30s]) / - rate(ceph_rgw_get_initial_lat_count[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata - |||, + rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s} + ||| % u.matchers(), 'GET AVG', 0, 1, @@ -244,10 +285,10 @@ local u = import 'utils.libsonnet'; [ u.addTargetSchema( ||| - rate(ceph_rgw_put_initial_lat_sum[30s]) / - rate(ceph_rgw_put_initial_lat_count[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata - |||, + rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s} + ||| % u.matchers(), 'PUT AVG' ), ] @@ -260,12 +301,12 @@ local u = import 'utils.libsonnet'; ||| sum by (rgw_host) ( label_replace( - rate(ceph_rgw_req[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, + rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) ) - |||, + ||| % u.matchers(), '{{rgw_host}}', 8, 1, @@ -279,12 +320,12 @@ local u = import 'utils.libsonnet'; 'short', ||| label_replace( - rate(ceph_rgw_get_initial_lat_sum[30s]) / - rate(ceph_rgw_get_initial_lat_count[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, + rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) - 
|||, + ||| % u.matchers(), '{{rgw_host}}', 15, 1, @@ -296,14 +337,14 @@ local u = import 'utils.libsonnet'; 'Total bytes transferred in/out of all radosgw instances within the cluster', 'bytes', 'short', - 'sum(rate(ceph_rgw_get_b[30s]))', + 'sum(rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]))' % u.matchers(), 'GETs', 0, 8, 8, 6 ).addTargets( - [u.addTargetSchema('sum(rate(ceph_rgw_put_b[30s]))', + [u.addTargetSchema('sum(rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]))' % u.matchers(), 'PUTs')] ), RgwOverviewPanel( @@ -313,12 +354,12 @@ local u = import 'utils.libsonnet'; 'short', ||| label_replace(sum by (instance_id) ( - rate(ceph_rgw_get_b[30s]) + - rate(ceph_rgw_put_b[30s])) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, + rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) + + rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval])) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) - |||, + ||| % u.matchers(), '{{rgw_host}}', 8, 8, @@ -332,12 +373,12 @@ local u = import 'utils.libsonnet'; 'short', ||| label_replace( - rate(ceph_rgw_put_initial_lat_sum[30s]) / - rate(ceph_rgw_put_initial_lat_count[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, + rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)" ) - |||, + ||| % u.matchers(), '{{rgw_host}}', 15, 8, @@ -354,8 +395,8 @@ local u = import 'utils.libsonnet'; 'short', ||| sum( - irate( - haproxy_frontend_http_responses_total{code=~"$code", instance=~"$ingress_service", proxy=~"frontend"}[5m] + rate( + haproxy_frontend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"frontend"}[$__rate_interval] ) ) by (code) |||, @@ -377,8 +418,8 @@ local 
u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - irate( - haproxy_backend_http_responses_total{code=~"$code", instance=~"$ingress_service", proxy=~"backend"}[5m] + rate( + haproxy_backend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"backend"}[$__rate_interval] ) ) by (code) |||, 'Backend {{ code }}' @@ -404,8 +445,8 @@ local u = import 'utils.libsonnet'; 'short', ||| sum( - irate( - haproxy_frontend_http_requests_total{proxy=~"frontend", instance=~"$ingress_service"}[5m] + rate( + haproxy_frontend_http_requests_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) ) by (instance) |||, @@ -427,8 +468,8 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - irate( - haproxy_backend_response_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m] + rate( + haproxy_backend_response_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) ) by (instance) |||, 'Response errors', 'time_series', 2 @@ -436,8 +477,8 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - irate( - haproxy_frontend_request_errors_total{proxy=~"frontend", instance=~"$ingress_service"}[5m] + rate( + haproxy_frontend_request_errors_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) ) by (instance) |||, 'Requests errors' @@ -445,8 +486,8 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - irate( - haproxy_backend_redispatch_warnings_total{proxy=~"backend", instance=~"$ingress_service"}[5m] + rate( + haproxy_backend_redispatch_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) ) by (instance) |||, 'Backend redispatch', 'time_series', 2 @@ -454,8 +495,8 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - irate( - haproxy_backend_retry_warnings_total{proxy=~"backend", 
instance=~"$ingress_service"}[5m] + rate( + haproxy_backend_retry_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) ) by (instance) |||, 'Backend retry', 'time_series', 2 @@ -463,8 +504,8 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - irate( - haproxy_frontend_requests_denied_total{proxy=~"frontend", instance=~"$ingress_service"}[5m] + rate( + haproxy_frontend_requests_denied_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) ) by (instance) |||, 'Request denied', 'time_series', 2 @@ -472,7 +513,7 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - haproxy_backend_current_queue{proxy=~"backend", instance=~"$ingress_service"} + haproxy_backend_current_queue{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"} ) by (instance) |||, 'Backend Queued', 'time_series', 2 ), @@ -495,8 +536,8 @@ local u = import 'utils.libsonnet'; 'short', ||| sum( - irate( - haproxy_frontend_connections_total{proxy=~"frontend", instance=~"$ingress_service"}[5m] + rate( + haproxy_frontend_connections_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) ) by (instance) |||, @@ -518,8 +559,8 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - irate( - haproxy_backend_connection_attempts_total{proxy=~"backend", instance=~"$ingress_service"}[5m] + rate( + haproxy_backend_connection_attempts_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) ) by (instance) |||, 'Back' @@ -527,8 +568,8 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - irate( - haproxy_backend_connection_errors_total{proxy=~"backend", instance=~"$ingress_service"}[5m] + rate( + haproxy_backend_connection_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) ) by (instance) |||, 'Back errors' @@ -548,8 +589,8 @@ 
local u = import 'utils.libsonnet'; 'short', ||| sum( - irate( - haproxy_frontend_bytes_in_total{proxy=~"frontend", instance=~"$ingress_service"}[5m] + rate( + haproxy_frontend_bytes_in_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) * 8 ) by (instance) |||, @@ -571,8 +612,8 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - irate( - haproxy_frontend_bytes_out_total{proxy=~"frontend", instance=~"$ingress_service"}[5m] + rate( + haproxy_frontend_bytes_out_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) * 8 ) by (instance) |||, 'OUT Front', 'time_series', 2 @@ -580,8 +621,8 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - irate( - haproxy_backend_bytes_in_total{proxy=~"backend", instance=~"$ingress_service"}[5m] + rate( + haproxy_backend_bytes_in_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) * 8 ) by (instance) |||, 'IN Back', 'time_series', 2 @@ -589,8 +630,8 @@ local u = import 'utils.libsonnet'; u.addTargetSchema( ||| sum( - irate( - haproxy_backend_bytes_out_total{proxy=~"backend", instance=~"$ingress_service"}[5m] + rate( + haproxy_backend_bytes_out_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval] ) * 8 ) by (instance) |||, 'OUT Back', 'time_series', 2 @@ -641,7 +682,7 @@ local u = import 'utils.libsonnet'; 'now-1h', '15s', 16, - ['overview'], + c.dashboardTags + ['overview'], '', { refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], @@ -677,10 +718,16 @@ local u = import 'utils.libsonnet'; 'default', label='Data Source') ) + .addTemplate( + u.addClusterTemplate() + ) + .addTemplate( + u.addJobTemplate() + ) .addTemplate( u.addTemplateSchema('rgw_servers', '$datasource', - 'label_values(ceph_rgw_metadata, ceph_daemon)', + 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % u.matchers(), 1, 
true, 1, @@ -697,14 +744,16 @@ local u = import 'utils.libsonnet'; 'short', ||| sum by (instance_id) ( - rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) - ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) + ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % u.matchers(), ||| sum by (instance_id) ( - rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) - ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) / + rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) + ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % u.matchers(), 'GET {{ceph_daemon}}', 'PUT {{ceph_daemon}}', 0, @@ -719,13 +768,14 @@ local u = import 'utils.libsonnet'; 'bytes', 'short', ||| - rate(ceph_rgw_get_b[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % u.matchers(), ||| - rate(ceph_rgw_put_b[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) + ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % u.matchers(), 'GETs {{ceph_daemon}}', 'PUTs {{ceph_daemon}}', 6, @@ -746,13 +796,13 @@ local u = import 'utils.libsonnet'; 'short', 'short', ||| - rate(ceph_rgw_failed_req[30s]) * - on (instance_id) 
group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s,ceph_daemon=~"$rgw_servers"} + ||| % u.matchers(), ||| - rate(ceph_rgw_get[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % u.matchers(), 'Requests Failed {{ceph_daemon}}', 'GETs {{ceph_daemon}}', 13, @@ -764,18 +814,21 @@ local u = import 'utils.libsonnet'; [ u.addTargetSchema( ||| - rate(ceph_rgw_put[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % u.matchers(), 'PUTs {{ceph_daemon}}' ), u.addTargetSchema( ||| ( - rate(ceph_rgw_req[30s]) - - (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s])) - ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) - + ( + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) + + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) + ) + ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % u.matchers(), 'Other {{ceph_daemon}}' ), ] @@ -791,33 +844,36 @@ local u = import 'utils.libsonnet'; ) .addTarget(u.addTargetSchema( ||| - rate(ceph_rgw_failed_req[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| 
% u.matchers(), 'Failures {{ceph_daemon}}' )) .addTarget(u.addTargetSchema( ||| - rate(ceph_rgw_get[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % u.matchers(), 'GETs {{ceph_daemon}}' )) .addTarget(u.addTargetSchema( ||| - rate(ceph_rgw_put[30s]) * - on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) * + on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % u.matchers(), 'PUTs {{ceph_daemon}}' )) .addTarget(u.addTargetSchema( ||| ( - rate(ceph_rgw_req[30s]) - - (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s])) + rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) - + ( + rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) + + rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) + ) ) * on (instance_id) group_left (ceph_daemon) - ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} - |||, + ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"} + ||| % u.matchers(), 'Other (DELETE,LIST) {{ceph_daemon}}' )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } }, ]), diff --git a/monitoring/ceph-mixin/dashboards/utils.libsonnet b/monitoring/ceph-mixin/dashboards/utils.libsonnet index 1f25d370c2a13..b5d3500a4e31e 100644 --- a/monitoring/ceph-mixin/dashboards/utils.libsonnet +++ b/monitoring/ceph-mixin/dashboards/utils.libsonnet @@ -1,4 +1,5 @@ local g = import 'grafonnet/grafana.libsonnet'; +local c = (import '../mixin.libsonnet')._config; { dashboardSchema(title, @@ -72,7 +73,10 @@ local g = import 'grafonnet/grafana.libsonnet'; includeAll, sort, label, - regex):: + regex, + hide='', + multi=false, + allValues=null):: g.template.new(name=name, datasource=datasource, query=query, @@ -80,7 +84,10 @@ local g = 
import 'grafonnet/grafana.libsonnet'; includeAll=includeAll, sort=sort, label=label, - regex=regex), + regex=regex, + hide=hide, + multi=multi, + allValues=allValues), addAnnotationSchema(builtIn, datasource, @@ -170,4 +177,43 @@ local g = import 'grafonnet/grafana.libsonnet'; unit: unit, valueMaps: valueMaps, }, + + matchers():: + local jobMatcher = 'job=~"$job"'; + local clusterMatcher = '%s=~"$cluster"' % c.clusterLabel; + { + // Common labels + jobMatcher: jobMatcher, + clusterMatcher: clusterMatcher, + matchers: '%s, %s' % [jobMatcher, clusterMatcher], + }, + + addClusterTemplate():: + $.addTemplateSchema( + 'cluster', + '$datasource', + 'label_values(ceph_osd_metadata, cluster)', + 1, + true, + 1, + 'cluster', + '(.*)', + if !c.showMultiCluster then 'variable' else '', + multi=true, + allValues='.+', + ), + + addJobTemplate():: + $.addTemplateSchema( + 'job', + '$datasource', + 'label_values(ceph_osd_metadata{%(clusterMatcher)s}, job)' % $.matchers(), + 1, + true, + 1, + 'job', + '(.*)', + multi=true, + allValues='.+', + ), } diff --git a/monitoring/ceph-mixin/dashboards_out/.lint b/monitoring/ceph-mixin/dashboards_out/.lint new file mode 100644 index 0000000000000..6352e858f28d0 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards_out/.lint @@ -0,0 +1,5 @@ +exclusions: + template-instance-rule: + reason: "Instance template not needed because of ceph-mgr leader election." + target-instance-rule: + reason: "Instance matcher not needed because of ceph-mgr leader election." 
diff --git a/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json b/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json index 5c0c27329d69a..79dd870daa5a6 100644 --- a/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json @@ -104,14 +104,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(ceph_objecter_op_r{ceph_daemon=~\"($mds_servers).*\"}[1m]))", + "expr": "sum(rate(ceph_objecter_op_r{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"($mds_servers).*\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Read Ops", "refId": "A" }, { - "expr": "sum(rate(ceph_objecter_op_w{ceph_daemon=~\"($mds_servers).*\"}[1m]))", + "expr": "sum(rate(ceph_objecter_op_w{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"($mds_servers).*\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Write Ops", @@ -197,7 +197,7 @@ "steppedLine": false, "targets": [ { - "expr": "ceph_mds_server_handle_client_request{ceph_daemon=~\"($mds_servers).*\"}", + "expr": "ceph_mds_server_handle_client_request{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"($mds_servers).*\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ceph_daemon}}", @@ -245,7 +245,9 @@ "rows": [ ], "schemaVersion": 16, "style": "dark", - "tags": [ ], + "tags": [ + "ceph-mixin" + ], "templating": { "list": [ { @@ -262,6 +264,46 @@ "regex": "", "type": "datasource" }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + 
"includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { }, @@ -272,7 +314,7 @@ "multi": false, "name": "mds_servers", "options": [ ], - "query": "label_values(ceph_mds_inodes, ceph_daemon)", + "query": "label_values(ceph_mds_inodes{job=~\"$job\", cluster=~\"$cluster\"}, ceph_daemon)", "refresh": 1, "regex": "", "sort": 1, diff --git a/monitoring/ceph-mixin/dashboards_out/host-details.json b/monitoring/ceph-mixin/dashboards_out/host-details.json index 9abc6683ca0f9..defb80f207ebe 100644 --- a/monitoring/ceph-mixin/dashboards_out/host-details.json +++ b/monitoring/ceph-mixin/dashboards_out/host-details.json @@ -123,7 +123,7 @@ "tableColumn": "", "targets": [ { - "expr": "count(sum by (ceph_daemon) (ceph_osd_metadata{hostname='$ceph_hosts'}))", + "expr": "count(sum by (ceph_daemon) (ceph_osd_metadata{job=~\"$job\", cluster=~\"$cluster\", hostname='$ceph_hosts'}))", "format": "time_series", "intervalFactor": 1, "legendFormat": "", @@ -192,7 +192,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (mode) (\n irate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[1m]) or\n irate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[1m])\n) / (\n scalar(\n sum(irate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[1m]) or\n irate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[1m]))\n ) * 100\n)\n", + "expr": "sum by (mode) (\n rate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[$__rate_interval]) or\n 
rate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[$__rate_interval])\n) / (\n scalar(\n sum(rate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]))\n ) * 100\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{mode}}", @@ -405,14 +405,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (device) (\n irate(\n node_network_receive_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[1m]) or\n irate(node_network_receive_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[1m]\n )\n)\n", + "expr": "sum by (device) (\n rate(\n node_network_receive_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]\n )\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}.rx", "refId": "A" }, { - "expr": "sum by (device) (\n irate(node_network_transmit_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[1m])\n)\n", + "expr": "sum by (device) (\n rate(node_network_transmit_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval])\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}.tx", @@ -503,14 +503,14 @@ "steppedLine": false, "targets": [ { - "expr": "irate(node_network_receive_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[1m]) or\n irate(node_network_receive_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[1m])\n", + "expr": 
"rate(node_network_receive_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_receive_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}.rx", "refId": "A" }, { - "expr": "irate(node_network_transmit_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[1m]) or\n irate(node_network_transmit_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[1m])\n", + "expr": "rate(node_network_transmit_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_transmit_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}.tx", @@ -615,7 +615,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(\n ceph_osd_stat_bytes and\n on (ceph_daemon) ceph_disk_occupation{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}\n)\n", + "expr": "sum(\n ceph_osd_stat_bytes{job=~\"$job\", cluster=~\"$cluster\"} and\n on (ceph_daemon) ceph_disk_occupation{job=~\"$job\", cluster=~\"$cluster\", instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "", @@ -683,14 +683,14 @@ "steppedLine": false, "targets": [ { - "expr": "irate(node_network_receive_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[1m]) or\n irate(node_network_receive_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[1m])\n", + "expr": "rate(node_network_receive_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_receive_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}.rx", "refId": "A" }, { - "expr": "irate(node_network_transmit_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[1m]) or\n irate(node_network_transmit_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[1m])\n", 
+ "expr": "rate(node_network_transmit_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_transmit_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}.tx", @@ -800,14 +800,14 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n (\n irate(node_disk_writes_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "expr": "label_replace(\n (\n rate(node_disk_writes_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}({{ceph_daemon}}) writes", "refId": "A" }, { - "expr": "label_replace(\n (\n irate(node_disk_reads_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m]) or\n irate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human,\"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + 
"expr": "label_replace(\n (\n rate(node_disk_reads_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\"},\"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}({{ceph_daemon}}) reads", @@ -898,14 +898,14 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n (\n irate(node_disk_bytes_written{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m]) or\n irate(node_disk_written_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n", + "expr": "label_replace(\n (\n rate(node_disk_bytes_written{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_written_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}({{ceph_daemon}}) write", "refId": "A" }, { - "expr": "label_replace(\n (\n irate(node_disk_bytes_read{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m]) or\n 
irate(node_disk_read_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m])\n ),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n", + "expr": "label_replace(\n (\n rate(node_disk_bytes_read{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_read_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}({{ceph_daemon}}) read", @@ -991,7 +991,7 @@ "steppedLine": false, "targets": [ { - "expr": "max by(instance, device) (label_replace(\n (irate(node_disk_write_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m])) /\n clamp_min(irate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m]), 0.001) or\n (irate(node_disk_read_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m])) /\n clamp_min(irate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m]), 0.001),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "expr": "max by(instance, device) (label_replace(\n 
(rate(node_disk_write_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])) /\n clamp_min(rate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]), 0.001) or\n (rate(node_disk_read_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])) /\n clamp_min(rate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]), 0.001),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}({{ceph_daemon}})", @@ -1077,7 +1077,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n (\n (irate(node_disk_io_time_ms{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m]) / 10) or\n irate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[5m]) * 100\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "expr": "label_replace(\n (\n (rate(node_disk_io_time_ms{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) / 10) or\n rate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) * 100\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\", instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"},\n \"device\", \"$1\", 
\"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}}({{ceph_daemon}})", @@ -1126,6 +1126,7 @@ "schemaVersion": 16, "style": "dark", "tags": [ + "ceph-mixin", "overview" ], "templating": { @@ -1144,6 +1145,46 @@ "regex": "", "type": "datasource" }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { }, @@ -1154,7 +1195,7 @@ "multi": false, "name": "ceph_hosts", "options": [ ], - "query": "label_values(node_scrape_collector_success, instance) ", + "query": "label_values({cluster=~\"$cluster\"}, instance)", "refresh": 1, "regex": "([^.:]*).*", "sort": 3, diff --git a/monitoring/ceph-mixin/dashboards_out/hosts-overview.json b/monitoring/ceph-mixin/dashboards_out/hosts-overview.json index d43696988a90e..89f65a0d350f0 100644 --- a/monitoring/ceph-mixin/dashboards_out/hosts-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/hosts-overview.json @@ -104,7 +104,7 @@ "tableColumn": "", "targets": [ { - "expr": "count(sum by (hostname) (ceph_osd_metadata))", + "expr": "count(sum by (hostname) (ceph_osd_metadata{job=~\"$job\", cluster=~\"$cluster\"}))", "format": "time_series", "instant": true, 
"intervalFactor": 1, @@ -187,7 +187,7 @@ "tableColumn": "", "targets": [ { - "expr": "avg(1 - (\n avg by(instance) (\n irate(node_cpu_seconds_total{mode=\\'idle\\',instance=~\\\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\\\"}[1m]) or\n irate(node_cpu{mode=\\'idle\\',instance=~\\\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\\\"}[1m])\n )\n))\n", + "expr": "avg(1 - (\n avg by(instance) (\n rate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval]) or\n rate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval])\n )\n))\n", "format": "time_series", "instant": true, "intervalFactor": 1, @@ -270,7 +270,7 @@ "tableColumn": "", "targets": [ { - "expr": "avg ((\n (\n node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) - ((\n node_memory_MemFree{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemFree_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) +\n (\n node_memory_Cached{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Cached_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Buffers{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Buffers_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Slab{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Slab_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n )\n )\n)\n(\n node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*\"}\n))\n", + "expr": "avg ((\n (\n 
node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) - ((\n node_memory_MemFree{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemFree_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) +\n (\n node_memory_Cached{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Cached_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Buffers{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Buffers_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Slab{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Slab_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n )\n )\n) / (\n node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*\"}\n))\n", "format": "time_series", "instant": true, "intervalFactor": 1, @@ -353,7 +353,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum ((\n irate(node_disk_reads_completed{instance=~\"($osd_hosts).*\"}[5m]) or\n irate(node_disk_reads_completed_total{instance=~\"($osd_hosts).*\"}[5m])\n) + (\n irate(node_disk_writes_completed{instance=~\"($osd_hosts).*\"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~\"($osd_hosts).*\"}[5m])\n))\n", + "expr": "sum ((\n rate(node_disk_reads_completed{instance=~\"($osd_hosts).*\"}[$__rate_interval]) or\n rate(node_disk_reads_completed_total{instance=~\"($osd_hosts).*\"}[$__rate_interval])\n) + (\n rate(node_disk_writes_completed{instance=~\"($osd_hosts).*\"}[$__rate_interval]) or\n rate(node_disk_writes_completed_total{instance=~\"($osd_hosts).*\"}[$__rate_interval])\n))\n", "format": "time_series", "instant": true, 
"intervalFactor": 1, @@ -436,7 +436,7 @@ "tableColumn": "", "targets": [ { - "expr": "avg (\n label_replace(\n (irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100),\n \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{instance=~\"($osd_hosts).*\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n )\n)\n", + "expr": "avg (\n label_replace(\n (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or\n (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100),\n \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\", instance=~\"($osd_hosts).*\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n )\n)\n", "format": "time_series", "instant": true, "intervalFactor": 1, @@ -519,7 +519,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum (\n (\n irate(node_network_receive_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_receive_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n ) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n) +\nsum (\n (\n irate(node_network_transmit_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n ) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n )\n", + "expr": "sum (\n (\n 
rate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n ) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n) +\nsum (\n (\n rate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n ) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n)\n", "format": "time_series", "instant": true, "intervalFactor": 1, @@ -583,7 +583,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,\n 100 * (\n 1 - (\n avg by(instance) (\n irate(node_cpu_seconds_total{mode=\\'idle\\',instance=~\\\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\\\"}[1m]) or\n irate(node_cpu{mode=\\'idle\\',instance=~\\\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\\\"}[1m])\n )\n )\n )\n)\n", + "expr": "topk(10,\n 100 * (\n 1 - (\n avg by(instance) (\n rate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval]) or\n rate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval])\n )\n )\n )\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", @@ -669,7 +669,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, (sum by(instance) (\n(\n irate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) +\n(\n
irate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\"))\n))\n", + "expr": "topk(10, (sum by(instance) (\n(\n rate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n) +\n(\n rate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\"))\n))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", @@ -717,7 +717,9 @@ "rows": [ ], "schemaVersion": 16, "style": "dark", - "tags": [ ], + "tags": [ + "ceph-mixin" + ], "templating": { "list": [ { @@ -734,6 +736,46 @@ "regex": "", "type": "datasource" }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, 
job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { }, @@ -744,7 +786,7 @@ "multi": false, "name": "osd_hosts", "options": [ ], - "query": "label_values(ceph_disk_occupation, exported_instance)", + "query": "label_values(ceph_disk_occupation{job=~\"$job\", cluster=~\"$cluster\"}, exported_instance)", "refresh": 1, "regex": "([^.]*).*", "sort": 1, @@ -764,7 +806,7 @@ "multi": false, "name": "mon_hosts", "options": [ ], - "query": "label_values(ceph_mon_metadata, ceph_daemon)", + "query": "label_values(ceph_mon_metadata{job=~\"$job\", cluster=~\"$cluster\"}, ceph_daemon)", "refresh": 1, "regex": "mon.(.*)", "sort": 1, @@ -784,7 +826,7 @@ "multi": false, "name": "mds_hosts", "options": [ ], - "query": "label_values(ceph_mds_inodes, ceph_daemon)", + "query": "label_values(ceph_mds_inodes{job=~\"$job\", cluster=~\"$cluster\"}, ceph_daemon)", "refresh": 1, "regex": "mds.(.*)", "sort": 1, @@ -804,7 +846,7 @@ "multi": false, "name": "rgw_hosts", "options": [ ], - "query": "label_values(ceph_rgw_metadata, ceph_daemon)", + "query": "label_values(ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\"}, ceph_daemon)", "refresh": 1, "regex": "rgw.(.*)", "sort": 1, diff --git a/monitoring/ceph-mixin/dashboards_out/osd-device-details.json b/monitoring/ceph-mixin/dashboards_out/osd-device-details.json index c82a59dfd05b8..8406b3451832e 100644 --- a/monitoring/ceph-mixin/dashboards_out/osd-device-details.json +++ b/monitoring/ceph-mixin/dashboards_out/osd-device-details.json @@ -104,14 +104,14 @@ "steppedLine": false, "targets": [ { - "expr": "irate(ceph_osd_op_r_latency_sum{ceph_daemon=~\"$osd\"}[1m]) /\n on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m])\n", + "expr": "rate(ceph_osd_op_r_latency_sum{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"}[$__rate_interval]) /\n on (ceph_daemon) 
rate(ceph_osd_op_r_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "read", "refId": "A" }, { - "expr": "irate(ceph_osd_op_w_latency_sum{ceph_daemon=~\"$osd\"}[1m]) /\n on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m])\n", + "expr": "rate(ceph_osd_op_w_latency_sum{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "write", @@ -202,14 +202,14 @@ "steppedLine": false, "targets": [ { - "expr": "irate(ceph_osd_op_r{ceph_daemon=~\"$osd\"}[1m])", + "expr": "rate(ceph_osd_op_r{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"}[$__rate_interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Reads", "refId": "A" }, { - "expr": "irate(ceph_osd_op_w{ceph_daemon=~\"$osd\"}[1m])", + "expr": "rate(ceph_osd_op_w{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"}[$__rate_interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Writes", @@ -300,14 +300,14 @@ "steppedLine": false, "targets": [ { - "expr": "irate(ceph_osd_op_r_out_bytes{ceph_daemon=~\"$osd\"}[1m])", + "expr": "rate(ceph_osd_op_r_out_bytes{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"}[$__rate_interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Read Bytes", "refId": "A" }, { - "expr": "irate(ceph_osd_op_w_in_bytes{ceph_daemon=~\"$osd\"}[1m])", + "expr": "rate(ceph_osd_op_w_in_bytes{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"}[$__rate_interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Write Bytes", @@ -417,14 +417,14 @@ "steppedLine": false, "targets": [ { - "expr": "(\n label_replace(\n irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]),\n 
\"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n ) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n)\n", + "expr": "(\n label_replace(\n rate(node_disk_read_time_seconds_total{cluster=~\"$cluster\"}[$__rate_interval]) /\n rate(node_disk_reads_completed_total{cluster=~\"$cluster\"}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n ) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}/{{device}} Reads", "refId": "A" }, { - "expr": "(\n label_replace(\n irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device)\n label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n )\n", + "expr": "(\n label_replace(\n rate(node_disk_write_time_seconds_total{cluster=~\"$cluster\"}[$__rate_interval]) /\n rate(node_disk_writes_completed_total{cluster=~\"$cluster\"}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device)\n label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n )\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}/{{device}} Writes", @@ -515,14 +515,14 @@ "steppedLine": false, "targets": [ { - 
"expr": "label_replace(\n irate(node_disk_writes_completed_total[1m]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "expr": "label_replace(\n rate(node_disk_writes_completed_total{cluster=~\"$cluster\"}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}} on {{instance}} Writes", "refId": "A" }, { - "expr": "label_replace(\n irate(node_disk_reads_completed_total[1m]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "expr": "label_replace(\n rate(node_disk_reads_completed_total{cluster=~\"$cluster\"}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}} on {{instance}} Reads", @@ -613,14 +613,14 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n irate(node_disk_read_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n 
label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "expr": "label_replace(\n rate(node_disk_read_bytes_total{cluster=~\"$cluster\"}[$__rate_interval]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}} {{device}} Reads", "refId": "A" }, { - "expr": "label_replace(\n irate(node_disk_written_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "expr": "label_replace(\n rate(node_disk_written_bytes_total{cluster=~\"$cluster\"}[$__rate_interval]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}} {{device}} Writes", @@ -706,7 +706,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n irate(node_disk_io_time_seconds_total[1m]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "expr": 
"label_replace(\n rate(node_disk_io_time_seconds_total{cluster=~\"$cluster\"}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}} on {{instance}}", @@ -754,7 +754,9 @@ "rows": [ ], "schemaVersion": 16, "style": "dark", - "tags": [ ], + "tags": [ + "ceph-mixin" + ], "templating": { "list": [ { @@ -771,6 +773,46 @@ "regex": "", "type": "datasource" }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { }, @@ -781,7 +823,7 @@ "multi": false, "name": "osd", "options": [ ], - "query": "label_values(ceph_osd_metadata,ceph_daemon)", + "query": "label_values(ceph_osd_metadata{job=~\"$job\", cluster=~\"$cluster\"}, ceph_daemon)", "refresh": 1, "regex": "(.*)", "sort": 1, diff --git a/monitoring/ceph-mixin/dashboards_out/osds-overview.json b/monitoring/ceph-mixin/dashboards_out/osds-overview.json index 54db59b45022d..16e94ec8851de 100644 --- 
a/monitoring/ceph-mixin/dashboards_out/osds-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/osds-overview.json @@ -94,21 +94,21 @@ "steppedLine": false, "targets": [ { - "expr": "avg (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)", + "expr": "avg (\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) * 1000\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "AVG read", "refId": "A" }, { - "expr": "max(\n irate(ceph_osd_op_r_latency_sum[1m]) /\n on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000\n)\n", + "expr": "max(\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) * 1000\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "MAX read", "refId": "B" }, { - "expr": "quantile(0.95,\n (\n irate(ceph_osd_op_r_latency_sum[1m]) /\n on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m])\n * 1000\n )\n)\n", + "expr": "quantile(0.95,\n (\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n * 1000\n )\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "@95%ile", @@ -222,7 +222,7 @@ ], "targets": [ { - "expr": "topk(10,\n (sort(\n (\n irate(ceph_osd_op_r_latency_sum[1m]) /\n on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) *\n 1000\n )\n ))\n)\n", + "expr": "topk(10,\n (sort(\n (\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n 1000\n )\n 
))\n)\n", "format": "table", "instant": true, "intervalFactor": 1, @@ -281,21 +281,21 @@ "steppedLine": false, "targets": [ { - "expr": "avg(\n irate(ceph_osd_op_w_latency_sum[1m]) /\n on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m])\n * 1000\n)\n", + "expr": "avg(\n rate(ceph_osd_op_w_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n * 1000\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "AVG write", "refId": "A" }, { - "expr": "max(\n irate(ceph_osd_op_w_latency_sum[1m]) /\n on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) *\n 1000\n)\n", + "expr": "max(\n rate(ceph_osd_op_w_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n 1000\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "MAX write", "refId": "B" }, { - "expr": "quantile(0.95, (\n irate(ceph_osd_op_w_latency_sum[1m]) /\n on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) *\n 1000\n))\n", + "expr": "quantile(0.95, (\n rate(ceph_osd_op_w_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n 1000\n))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "@95%ile write", @@ -409,7 +409,7 @@ ], "targets": [ { - "expr": "topk(10,\n (sort(\n (irate(ceph_osd_op_w_latency_sum[1m]) /\n on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) *\n 1000)\n ))\n)\n", + "expr": "topk(10,\n (sort(\n (rate(ceph_osd_op_w_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n 1000)\n ))\n)\n", "format": "table", "instant": true, 
"intervalFactor": 1, @@ -443,7 +443,7 @@ "pieType": "pie", "targets": [ { - "expr": "count by (device_class) (ceph_osd_metadata)", + "expr": "count by (device_class) (ceph_osd_metadata{job=~\"$job\", cluster=~\"$cluster\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device_class}}", "refId": "A" @@ -476,14 +476,14 @@ "pieType": "pie", "targets": [ { - "expr": "count(ceph_bluefs_wal_total_bytes)", + "expr": "count(ceph_bluefs_wal_total_bytes{job=~\"$job\", cluster=~\"$cluster\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "bluestore", "refId": "A" }, { - "expr": "absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)", + "expr": "absent(ceph_bluefs_wal_total_bytes{job=~\"$job\", cluster=~\"$cluster\"}) * count(ceph_osd_metadata{job=~\"$job\", cluster=~\"$cluster\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "filestore", @@ -514,63 +514,63 @@ "pieType": "pie", "targets": [ { - "expr": "count(ceph_osd_stat_bytes < 1099511627776)", + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\", cluster=~\"$cluster\"} < 1099511627776)", "format": "time_series", "intervalFactor": 2, "legendFormat": "<1TB", "refId": "A" }, { - "expr": "count(ceph_osd_stat_bytes >= 1099511627776 < 2199023255552)", + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\", cluster=~\"$cluster\"} >= 1099511627776 < 2199023255552)", "format": "time_series", "intervalFactor": 2, "legendFormat": "<2TB", "refId": "B" }, { - "expr": "count(ceph_osd_stat_bytes >= 2199023255552 < 3298534883328)", + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\", cluster=~\"$cluster\"} >= 2199023255552 < 3298534883328)", "format": "time_series", "intervalFactor": 2, "legendFormat": "<3TB", "refId": "C" }, { - "expr": "count(ceph_osd_stat_bytes >= 3298534883328 < 4398046511104)", + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\", cluster=~\"$cluster\"} >= 3298534883328 < 4398046511104)", "format": "time_series", "intervalFactor": 2, "legendFormat": "<4TB", "refId": "D" }, { - 
"expr": "count(ceph_osd_stat_bytes >= 4398046511104 < 6597069766656)", + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\", cluster=~\"$cluster\"} >= 4398046511104 < 6597069766656)", "format": "time_series", "intervalFactor": 2, "legendFormat": "<6TB", "refId": "E" }, { - "expr": "count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208)", + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\", cluster=~\"$cluster\"} >= 6597069766656 < 8796093022208)", "format": "time_series", "intervalFactor": 2, "legendFormat": "<8TB", "refId": "F" }, { - "expr": "count(ceph_osd_stat_bytes >= 8796093022208 < 10995116277760)", + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\", cluster=~\"$cluster\"} >= 8796093022208 < 10995116277760)", "format": "time_series", "intervalFactor": 2, "legendFormat": "<10TB", "refId": "G" }, { - "expr": "count(ceph_osd_stat_bytes >= 10995116277760 < 13194139533312)", + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\", cluster=~\"$cluster\"} >= 10995116277760 < 13194139533312)", "format": "time_series", "intervalFactor": 2, "legendFormat": "<12TB", "refId": "H" }, { - "expr": "count(ceph_osd_stat_bytes >= 13194139533312)", + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\", cluster=~\"$cluster\"} >= 13194139533312)", "format": "time_series", "intervalFactor": 2, "legendFormat": "<12TB+", @@ -623,7 +623,7 @@ "steppedLine": false, "targets": [ { - "expr": "ceph_osd_numpg", + "expr": "ceph_osd_numpg{job=~\"$job\", cluster=~\"$cluster\"}", "format": "time_series", "instant": true, "intervalFactor": 1, @@ -728,7 +728,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(ceph_bluestore_onode_hits) / (\n sum(ceph_bluestore_onode_hits) +\n sum(ceph_bluestore_onode_misses)\n)\n", + "expr": "sum(ceph_bluestore_onode_hits{job=~\"$job\", cluster=~\"$cluster\"}) / (\n sum(ceph_bluestore_onode_hits{job=~\"$job\", cluster=~\"$cluster\"}) +\n sum(ceph_bluestore_onode_misses{job=~\"$job\", cluster=~\"$cluster\"})\n)\n", "format": "time_series", "intervalFactor": 1, 
"legendFormat": "", @@ -810,14 +810,14 @@ "steppedLine": false, "targets": [ { - "expr": "round(sum(irate(ceph_pool_rd[30s])))", + "expr": "round(sum(rate(ceph_pool_rd{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Reads", "refId": "A" }, { - "expr": "round(sum(irate(ceph_pool_wr[30s])))", + "expr": "round(sum(rate(ceph_pool_wr{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Writes", @@ -865,7 +865,9 @@ "rows": [ ], "schemaVersion": 16, "style": "dark", - "tags": [ ], + "tags": [ + "ceph-mixin" + ], "templating": { "list": [ { @@ -881,6 +883,46 @@ "refresh": 1, "regex": "", "type": "datasource" + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, diff --git a/monitoring/ceph-mixin/dashboards_out/pool-detail.json b/monitoring/ceph-mixin/dashboards_out/pool-detail.json index 26ce7bf41cdc0..216c09ed014e3 100644 --- a/monitoring/ceph-mixin/dashboards_out/pool-detail.json +++ b/monitoring/ceph-mixin/dashboards_out/pool-detail.json @@ -104,7 +104,7 @@ "tableColumn": "", "targets": [ { - "expr": "(ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)) *\n on(pool_id) 
group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}\n", + "expr": "(ceph_pool_stored{job=~\"$job\", cluster=~\"$cluster\"} / (ceph_pool_stored{job=~\"$job\", cluster=~\"$cluster\"} + ceph_pool_max_avail{job=~\"$job\", cluster=~\"$cluster\"})) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\", name=~\"$pool_name\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "", @@ -134,7 +134,7 @@ "#d44a3a" ], "datasource": "$datasource", - "description": "Time till pool is full assuming the average fill rate of the last 6 hours", + "description": "Time till pool is full assuming the average fill rate of the last 6 hours", "format": "s", "gauge": { "maxValue": false, @@ -186,7 +186,7 @@ "tableColumn": "", "targets": [ { - "expr": "(ceph_pool_max_avail / deriv(ceph_pool_stored[6h])) *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"} > 0\n", + "expr": "(ceph_pool_max_avail{job=~\"$job\", cluster=~\"$cluster\"} / deriv(ceph_pool_stored{job=~\"$job\", cluster=~\"$cluster\"}[6h])) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\", name=~\"$pool_name\"} > 0\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "", @@ -252,7 +252,7 @@ "steppedLine": false, "targets": [ { - "expr": "deriv(ceph_pool_objects[1m]) *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}\n", + "expr": "deriv(ceph_pool_objects{job=~\"$job\", cluster=~\"$cluster\"}[1m]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\", name=~\"$pool_name\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Objects per second", @@ -346,7 +346,7 @@ "steppedLine": false, "targets": [ { - "expr": "irate(ceph_pool_rd[1m]) *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}\n", + "expr": "rate(ceph_pool_rd{job=~\"$job", 
cluster=~\"$cluster\"}[$__rate_interval]) *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\", name=~\"$pool_name\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "reads", "refId": "A" }, { - "expr": "irate(ceph_pool_wr[1m]) *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}\n", + "expr": "rate(ceph_pool_wr{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\", name=~\"$pool_name\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "writes", @@ -447,14 +447,14 @@ "steppedLine": false, "targets": [ { - "expr": "irate(ceph_pool_rd_bytes[1m]) +\n on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~\"$pool_name\"}\n", + "expr": "rate(ceph_pool_rd_bytes{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\", name=~\"$pool_name\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "reads", "refId": "A" }, { - "expr": "irate(ceph_pool_wr_bytes[1m]) +\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}\n", + "expr": "rate(ceph_pool_wr_bytes{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\", name=~\"$pool_name\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "writes", @@ -543,7 +543,7 @@ "steppedLine": false, "targets": [ { - "expr": "ceph_pool_objects *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\"}\n", + "expr": "ceph_pool_objects{job=~\"$job\", cluster=~\"$cluster\"} *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\", name=~\"$pool_name\"}\n", "format": "time_series", "intervalFactor": 1, 
"legendFormat": "Number of Objects", @@ -591,13 +591,15 @@ "rows": [ ], "schemaVersion": 22, "style": "dark", - "tags": [ ], + "tags": [ + "ceph-mixin" + ], "templating": { "list": [ { "current": { - "text": "Prometheus admin.virt1.home.fajerski.name:9090", - "value": "Prometheus admin.virt1.home.fajerski.name:9090" + "text": "default", + "value": "default" }, "hide": 0, "label": "Data Source", @@ -608,6 +610,46 @@ "regex": "", "type": "datasource" }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { }, @@ -618,7 +660,7 @@ "multi": false, "name": "pool_name", "options": [ ], - "query": "label_values(ceph_pool_metadata,name)", + "query": "label_values(ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\"}, name)", "refresh": 1, "regex": "", "sort": 1, diff --git a/monitoring/ceph-mixin/dashboards_out/pool-overview.json b/monitoring/ceph-mixin/dashboards_out/pool-overview.json index ecd77749d0571..bc2a49fd9cb0f 100644 --- a/monitoring/ceph-mixin/dashboards_out/pool-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/pool-overview.json @@ -85,7 +85,7 @@ "tableColumn": "", "targets": [ { - "expr": "count(ceph_pool_metadata)", + "expr": "count(ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\"})", 
"format": "table", "instant": true, "intervalFactor": 1, @@ -168,7 +168,7 @@ "tableColumn": "", "targets": [ { - "expr": "count(ceph_pool_metadata{compression_mode!=\"none\"})", + "expr": "count(ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\", compression_mode!=\"none\"})", "format": "", "intervalFactor": 1, "legendFormat": "", @@ -250,7 +250,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(ceph_osd_stat_bytes)", + "expr": "sum(ceph_osd_stat_bytes{job=~\"$job\", cluster=~\"$cluster\"})", "format": "", "intervalFactor": 1, "legendFormat": "", @@ -332,7 +332,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(ceph_pool_bytes_used)", + "expr": "sum(ceph_pool_bytes_used{job=~\"$job\", cluster=~\"$cluster\"})", "format": "", "instant": true, "intervalFactor": 1, @@ -415,7 +415,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(ceph_pool_stored)", + "expr": "sum(ceph_pool_stored{job=~\"$job\", cluster=~\"$cluster\"})", "format": "", "instant": true, "intervalFactor": 1, @@ -498,7 +498,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used)", + "expr": "sum(\n ceph_pool_compress_under_bytes{job=~\"$job\", cluster=~\"$cluster\"} -\n ceph_pool_compress_bytes_used{job=~\"$job\", cluster=~\"$cluster\"}\n)\n", "format": "", "intervalFactor": 1, "legendFormat": "", @@ -580,7 +580,7 @@ "tableColumn": "", "targets": [ { - "expr": "(\n sum(ceph_pool_compress_under_bytes > 0) /\n sum(ceph_pool_stored_raw and ceph_pool_compress_under_bytes > 0)\n) * 100\n", + "expr": "(\n sum(ceph_pool_compress_under_bytes{job=~\"$job\", cluster=~\"$cluster\"} > 0) /\n sum(ceph_pool_stored_raw{job=~\"$job\", cluster=~\"$cluster\"} and ceph_pool_compress_under_bytes{job=~\"$job\", cluster=~\"$cluster\"} > 0)\n) * 100\n", "format": "table", "intervalFactor": 1, "legendFormat": "", @@ -662,7 +662,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(ceph_pool_compress_under_bytes > 0) / sum(ceph_pool_compress_bytes_used > 
0)", + "expr": "sum(\n ceph_pool_compress_under_bytes{job=~\"$job\", cluster=~\"$cluster\"} > 0)\n / sum(ceph_pool_compress_bytes_used{job=~\"$job\", cluster=~\"$cluster\"} > 0\n)\n", "format": "", "intervalFactor": 1, "legendFormat": "", @@ -1053,7 +1053,7 @@ ], "targets": [ { - "expr": "(ceph_pool_compress_under_bytes / ceph_pool_compress_bytes_used > 0) and on(pool_id) (\n ((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100 > 0.5\n)\n", + "expr": "(\n ceph_pool_compress_under_bytes{job=~\"$job\", cluster=~\"$cluster\"} /\n ceph_pool_compress_bytes_used{job=~\"$job\", cluster=~\"$cluster\"} > 0\n) and on(pool_id) (\n (\n (ceph_pool_compress_under_bytes{job=~\"$job\", cluster=~\"$cluster\"} > 0) /\n ceph_pool_stored_raw{job=~\"$job\", cluster=~\"$cluster\"}\n ) * 100 > 0.5\n)\n", "format": "table", "instant": true, "intervalFactor": 1, @@ -1061,7 +1061,7 @@ "refId": "A" }, { - "expr": "ceph_pool_max_avail * on(pool_id) group_left(name) ceph_pool_metadata", + "expr": "ceph_pool_max_avail{job=~\"$job\", cluster=~\"$cluster\"} *\n on(pool_id) group_left(name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\"}\n", "format": "table", "instant": true, "intervalFactor": 1, @@ -1069,7 +1069,7 @@ "refId": "B" }, { - "expr": "((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100", + "expr": "(\n (ceph_pool_compress_under_bytes{job=~\"$job\", cluster=~\"$cluster\"} > 0) /\n ceph_pool_stored_raw{job=~\"$job\", cluster=~\"$cluster\"}\n) * 100\n", "format": "table", "instant": true, "intervalFactor": 1, @@ -1077,7 +1077,7 @@ "refId": "C" }, { - "expr": "(ceph_pool_percent_used * on(pool_id) group_left(name) ceph_pool_metadata)", + "expr": "ceph_pool_percent_used{job=~\"$job\", cluster=~\"$cluster\"} *\n on(pool_id) group_left(name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\"}\n", "format": "table", "instant": true, "intervalFactor": 1, @@ -1085,7 +1085,7 @@ "refId": "D" }, { - "expr": "(ceph_pool_compress_under_bytes - 
ceph_pool_compress_bytes_used > 0)", + "expr": "ceph_pool_compress_under_bytes{job=~\"$job\", cluster=~\"$cluster\"} -\n ceph_pool_compress_bytes_used{job=~\"$job\", cluster=~\"$cluster\"} > 0\n", "format": "table", "instant": true, "intervalFactor": 1, @@ -1093,7 +1093,7 @@ "refId": "E" }, { - "expr": "delta(ceph_pool_stored[5d])", + "expr": "delta(ceph_pool_stored{job=~\"$job\", cluster=~\"$cluster\"}[5d])", "format": "table", "instant": true, "intervalFactor": 1, @@ -1101,7 +1101,7 @@ "refId": "F" }, { - "expr": "rate(ceph_pool_rd[30s]) + rate(ceph_pool_wr[30s])", + "expr": "rate(ceph_pool_rd{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n + rate(ceph_pool_wr{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n", "format": "table", "instant": true, "intervalFactor": 1, @@ -1109,7 +1109,7 @@ "refId": "G" }, { - "expr": "rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])", + "expr": "rate(ceph_pool_rd_bytes{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) +\n rate(ceph_pool_wr_bytes{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n", "format": "table", "instant": true, "intervalFactor": 1, @@ -1117,7 +1117,7 @@ "refId": "H" }, { - "expr": "ceph_pool_metadata", + "expr": "ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\"}", "format": "table", "instant": true, "intervalFactor": 1, @@ -1125,7 +1125,7 @@ "refId": "I" }, { - "expr": "ceph_pool_stored * on(pool_id) group_left ceph_pool_metadata", + "expr": "ceph_pool_stored{job=~\"$job\", cluster=~\"$cluster\"} * on(pool_id) group_left ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\"}", "format": "table", "instant": true, "intervalFactor": 1, @@ -1133,7 +1133,7 @@ "refId": "J" }, { - "expr": "ceph_pool_metadata{compression_mode!=\"none\"}", + "expr": "ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\", compression_mode!=\"none\"}", "format": "table", "instant": true, "intervalFactor": 1, @@ -1197,14 +1197,14 @@ "steppedLine": false, "targets": [ { - "expr": 
"topk($topk,\n round(\n (rate(ceph_pool_rd[30s]) + rate(ceph_pool_wr[30s])),\n 1\n ) * on(pool_id) group_left(instance,name) ceph_pool_metadata)\n", + "expr": "topk($topk,\n round(\n (\n rate(ceph_pool_rd{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) +\n rate(ceph_pool_wr{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n ), 1\n ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\"})\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{name}} ", "refId": "A" }, { - "expr": "topk($topk,\n rate(ceph_pool_wr[30s]) +\n on(pool_id) group_left(instance,name) ceph_pool_metadata\n)\n", + "expr": "topk($topk,\n rate(ceph_pool_wr{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) +\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\"}\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{name}} - write", @@ -1290,7 +1290,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk($topk,\n (rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata\n)\n", + "expr": "topk($topk,\n (\n rate(ceph_pool_rd_bytes{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) +\n rate(ceph_pool_wr_bytes{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\"}\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{name}}", @@ -1376,7 +1376,7 @@ "steppedLine": false, "targets": [ { - "expr": "ceph_pool_bytes_used * on(pool_id) group_right ceph_pool_metadata", + "expr": "ceph_pool_bytes_used{job=~\"$job\", cluster=~\"$cluster\"} * on(pool_id) group_right ceph_pool_metadata{job=~\"$job\", cluster=~\"$cluster\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{name}}", @@ -1424,13 +1424,15 @@ "rows": [ ], "schemaVersion": 22, "style": "dark", - "tags": 
[ ], + "tags": [ + "ceph-mixin" + ], "templating": { "list": [ { "current": { - "text": "Dashboard1", - "value": "Dashboard1" + "text": "default", + "value": "default" }, "hide": 0, "label": "Data Source", @@ -1441,6 +1443,46 @@ "regex": "", "type": "datasource" }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json b/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json index 2bd8ac4055fa0..ea45b685a1f49 100644 --- a/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json @@ -105,14 +105,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (instance_id) (\n rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": "sum by (instance_id) (\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\", 
ceph_daemon=~\"$rgw_servers\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "GET {{ceph_daemon}}", "refId": "A" }, { - "expr": "sum by (instance_id) (\n rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": "sum by (instance_id) (\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$rgw_servers\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUT {{ceph_daemon}}", @@ -198,14 +198,14 @@ "steppedLine": false, "targets": [ { - "expr": "rate(ceph_rgw_get_b[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": "rate(ceph_rgw_get_b{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$rgw_servers\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "GETs {{ceph_daemon}}", "refId": "A" }, { - "expr": "rate(ceph_rgw_put_b[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": "rate(ceph_rgw_put_b{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon)\n ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$rgw_servers\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUTs {{ceph_daemon}}", @@ -297,28 +297,28 @@ "steppedLine": false, "targets": [ { - "expr": "rate(ceph_rgw_failed_req[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": 
"rate(ceph_rgw_failed_req{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\",ceph_daemon=~\"$rgw_servers\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Requests Failed {{ceph_daemon}}", "refId": "A" }, { - "expr": "rate(ceph_rgw_get[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": "rate(ceph_rgw_get{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$rgw_servers\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "GETs {{ceph_daemon}}", "refId": "B" }, { - "expr": "rate(ceph_rgw_put[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": "rate(ceph_rgw_put{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$rgw_servers\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUTs {{ceph_daemon}}", "refId": "C" }, { - "expr": "(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": "(\n rate(ceph_rgw_req{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) -\n (\n rate(ceph_rgw_get{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) +\n rate(ceph_rgw_put{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n )\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$rgw_servers\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Other {{ceph_daemon}}", @@ -387,28 +387,28 @@ "pieType": "pie", "targets": [ { - "expr": 
"rate(ceph_rgw_failed_req[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": "rate(ceph_rgw_failed_req{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$rgw_servers\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Failures {{ceph_daemon}}", "refId": "A" }, { - "expr": "rate(ceph_rgw_get[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": "rate(ceph_rgw_get{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$rgw_servers\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "GETs {{ceph_daemon}}", "refId": "B" }, { - "expr": "rate(ceph_rgw_put[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": "rate(ceph_rgw_put{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$rgw_servers\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUTs {{ceph_daemon}}", "refId": "C" }, { - "expr": "(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon)\n ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\"}\n", + "expr": "(\n rate(ceph_rgw_req{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) -\n (\n rate(ceph_rgw_get{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) +\n rate(ceph_rgw_put{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n )\n) * on (instance_id) group_left (ceph_daemon)\n ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\", ceph_daemon=~\"$rgw_servers\"}\n", "format": 
"time_series", "intervalFactor": 1, "legendFormat": "Other (DELETE,LIST) {{ceph_daemon}}", @@ -425,6 +425,7 @@ "schemaVersion": 16, "style": "dark", "tags": [ + "ceph-mixin", "overview" ], "templating": { @@ -443,6 +444,46 @@ "regex": "", "type": "datasource" }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { }, @@ -453,7 +494,7 @@ "multi": false, "name": "rgw_servers", "options": [ ], - "query": "label_values(ceph_rgw_metadata, ceph_daemon)", + "query": "label_values(ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\"}, ceph_daemon)", "refresh": 1, "regex": "", "sort": 1, diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json index 1c93adc4697dd..7ba8861b47e7e 100644 --- a/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json @@ -99,14 +99,14 @@ "steppedLine": false, "targets": [ { - "expr": "rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata\n", + "expr": "rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n 
rate(ceph_rgw_get_initial_lat_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "GET AVG", "refId": "A" }, { - "expr": "rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata\n", + "expr": "rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\"}\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUT AVG", @@ -192,7 +192,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n", + "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -278,7 +278,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) 
group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -364,14 +364,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(ceph_rgw_get_b[30s]))", + "expr": "sum(rate(ceph_rgw_get_b{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "GETs", "refId": "A" }, { - "expr": "sum(rate(ceph_rgw_put_b[30s]))", + "expr": "sum(rate(ceph_rgw_put_b{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "PUTs", @@ -457,7 +457,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) +\n rate(ceph_rgw_put_b[30s])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) +\n rate(ceph_rgw_put_b{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -543,7 +543,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) *\n on (instance_id) 
group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{rgw_host}}", @@ -673,14 +673,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum(\n irate(\n haproxy_frontend_http_responses_total{code=~\"$code\", instance=~\"$ingress_service\", proxy=~\"frontend\"}[5m]\n )\n) by (code)\n", + "expr": "sum(\n rate(\n haproxy_frontend_http_responses_total{code=~\"$code\", job=~\"$job_haproxy\", instance=~\"$ingress_service\", proxy=~\"frontend\"}[$__rate_interval]\n )\n) by (code)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Frontend {{ code }}", "refId": "A" }, { - "expr": "sum(\n irate(\n haproxy_backend_http_responses_total{code=~\"$code\", instance=~\"$ingress_service\", proxy=~\"backend\"}[5m]\n )\n) by (code)\n", + "expr": "sum(\n rate(\n haproxy_backend_http_responses_total{code=~\"$code\", job=~\"$job_haproxy\", instance=~\"$ingress_service\", proxy=~\"backend\"}[$__rate_interval]\n )\n) by (code)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Backend {{ code }}", @@ -777,49 +777,49 @@ "steppedLine": false, "targets": [ { - "expr": "sum(\n irate(\n haproxy_frontend_http_requests_total{proxy=~\"frontend\", instance=~\"$ingress_service\"}[5m]\n )\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_frontend_http_requests_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Requests", "refId": "A" }, { - "expr": "sum(\n irate(\n haproxy_backend_response_errors_total{proxy=~\"backend\",instance=~\"$ingress_service\"}[5m]\n )\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_backend_response_errors_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", "format": 
"time_series", "intervalFactor": 2, "legendFormat": "Response errors", "refId": "B" }, { - "expr": "sum(\n irate(\n haproxy_frontend_request_errors_total{proxy=~\"frontend\", instance=~\"$ingress_service\"}[5m]\n )\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_frontend_request_errors_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Requests errors", "refId": "C" }, { - "expr": "sum(\n irate(\n haproxy_backend_redispatch_warnings_total{proxy=~\"backend\", instance=~\"$ingress_service\"}[5m]\n )\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_backend_redispatch_warnings_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "Backend redispatch", "refId": "D" }, { - "expr": "sum(\n irate(\n haproxy_backend_retry_warnings_total{proxy=~\"backend\", instance=~\"$ingress_service\"}[5m]\n )\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_backend_retry_warnings_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "Backend retry", "refId": "E" }, { - "expr": "sum(\n irate(\n haproxy_frontend_requests_denied_total{proxy=~\"frontend\", instance=~\"$ingress_service\"}[5m]\n )\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_frontend_requests_denied_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "Request denied", "refId": "F" }, { - "expr": "sum(\n haproxy_backend_current_queue{proxy=~\"backend\", instance=~\"$ingress_service\"}\n) by (instance)\n", + "expr": "sum(\n haproxy_backend_current_queue{proxy=~\"backend\", 
job=~\"$job_haproxy\", instance=~\"$ingress_service\"}\n) by (instance)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "Backend Queued", @@ -912,21 +912,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum(\n irate(\n haproxy_frontend_connections_total{proxy=~\"frontend\", instance=~\"$ingress_service\"}[5m]\n )\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_frontend_connections_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Front", "refId": "A" }, { - "expr": "sum(\n irate(\n haproxy_backend_connection_attempts_total{proxy=~\"backend\", instance=~\"$ingress_service\"}[5m]\n )\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_backend_connection_attempts_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Back", "refId": "B" }, { - "expr": "sum(\n irate(\n haproxy_backend_connection_errors_total{proxy=~\"backend\", instance=~\"$ingress_service\"}[5m]\n )\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_backend_connection_errors_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Back errors", @@ -1019,28 +1019,28 @@ "steppedLine": false, "targets": [ { - "expr": "sum(\n irate(\n haproxy_frontend_bytes_in_total{proxy=~\"frontend\", instance=~\"$ingress_service\"}[5m]\n ) * 8\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_frontend_bytes_in_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "IN Front", "refId": "A" }, { - "expr": "sum(\n irate(\n 
haproxy_frontend_bytes_out_total{proxy=~\"frontend\", instance=~\"$ingress_service\"}[5m]\n ) * 8\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_frontend_bytes_out_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "OUT Front", "refId": "B" }, { - "expr": "sum(\n irate(\n haproxy_backend_bytes_in_total{proxy=~\"backend\", instance=~\"$ingress_service\"}[5m]\n ) * 8\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_backend_bytes_in_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "IN Back", "refId": "C" }, { - "expr": "sum(\n irate(\n haproxy_backend_bytes_out_total{proxy=~\"backend\", instance=~\"$ingress_service\"}[5m]\n ) * 8\n) by (instance)\n", + "expr": "sum(\n rate(\n haproxy_backend_bytes_out_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "OUT Back", @@ -1089,10 +1089,65 @@ "schemaVersion": 16, "style": "dark", "tags": [ + "ceph-mixin", "overview" ], "templating": { "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + 
"datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { }, @@ -1103,9 +1158,9 @@ "multi": false, "name": "rgw_servers", "options": [ ], - "query": "label_values(ceph_rgw_metadata, ceph_daemon)", + "query": "label_values(ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\"}, ceph_daemon)", "refresh": 1, - "regex": "", + "regex": "RGW Server", "sort": 1, "tagValuesQuery": "", "tags": [ ], @@ -1123,7 +1178,7 @@ "multi": false, "name": "code", "options": [ ], - "query": "label_values(haproxy_server_http_responses_total{instance=~\"$ingress_service\"}, code)", + "query": "label_values(haproxy_server_http_responses_total{job=~\"$job_haproxy\", instance=~\"$ingress_service\"}, code)", "refresh": 1, "regex": "", "sort": 1, @@ -1134,18 +1189,18 @@ "useTags": false }, { - "allValue": null, + "allValue": ".+", "current": { }, "datasource": "$datasource", "hide": 0, "includeAll": true, - "label": "Ingress Service", - "multi": false, - "name": "ingress_service", + "label": "job haproxy", + "multi": true, + "name": "job_haproxy", "options": [ ], - "query": "label_values(haproxy_server_status, instance)", + "query": "label_values(haproxy_server_status, job)", "refresh": 1, - "regex": "", + "regex": "(.*)", "sort": 1, "tagValuesQuery": "", "tags": [ ], @@ -1154,18 +1209,24 @@ "useTags": false }, { - "current": { - "text": "default", - "value": "default" - }, + "allValue": null, + "current": { }, + "datasource": "$datasource", "hide": 0, - "label": "Data Source", - "name": "datasource", + "includeAll": true, + "label": "Ingress Service", + "multi": false, + "name": "ingress_service", "options": [ ], - "query": "prometheus", + "query": 
"label_values(haproxy_server_status{job=~\"$job_haproxy\"}, instance)", "refresh": 1, "regex": "", - "type": "datasource" + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, diff --git a/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json index 232242acc5860..8cbab9a0c15d4 100644 --- a/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json @@ -80,7 +80,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_sum[30s]))", + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{source_zone}}", @@ -166,7 +166,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_count[30s]))", + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{source_zone}}", @@ -252,7 +252,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_poll_latency_sum[30s]))", + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_poll_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{source_zone}}", @@ -338,7 +338,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_errors[30s]))", + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_errors{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 1, 
"legendFormat": "{{source_zone}}", @@ -387,23 +387,58 @@ "schemaVersion": 16, "style": "dark", "tags": [ + "ceph-mixin", "overview" ], "templating": { "list": [ { - "allValue": null, + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", "current": { }, "datasource": "$datasource", "hide": 0, "includeAll": true, - "label": "", - "multi": false, - "name": "rgw_servers", + "label": "job", + "multi": true, + "name": "job", "options": [ ], - "query": "prometehus", + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, job)", "refresh": 1, - "regex": "", + "regex": "(.*)", "sort": 1, "tagValuesQuery": "", "tags": [ ], @@ -412,18 +447,24 @@ "useTags": false }, { - "current": { - "text": "default", - "value": "default" - }, + "allValue": null, + "current": { }, + "datasource": "$datasource", "hide": 0, - "label": "Data Source", - "name": "datasource", + "includeAll": true, + "label": "", + "multi": false, + "name": "rgw_servers", "options": [ ], - "query": "prometheus", + "query": "label_values(ceph_rgw_metadata{job=~\"$job\", cluster=~\"$cluster\"}, ceph_daemon)", "refresh": 1, - "regex": "", - "type": "datasource" + "regex": "RGW Server", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, diff --git a/monitoring/ceph-mixin/dashboards_out/rbd-details.json b/monitoring/ceph-mixin/dashboards_out/rbd-details.json index 
1f03187260003..cb25f8ec89b32 100644 --- a/monitoring/ceph-mixin/dashboards_out/rbd-details.json +++ b/monitoring/ceph-mixin/dashboards_out/rbd-details.json @@ -42,7 +42,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$Datasource", + "datasource": "$datasource", "description": "", "fill": 1, "fillGradient": 0, @@ -80,14 +80,14 @@ "steppedLine": false, "targets": [ { - "expr": "irate(ceph_rbd_write_ops{pool=\"$Pool\", image=\"$Image\"}[30s])", + "expr": "rate(ceph_rbd_write_ops{job=~\"$job\", cluster=~\"$cluster\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pool}} Write", "refId": "A" }, { - "expr": "irate(ceph_rbd_read_ops{pool=\"$Pool\", image=\"$Image\"}[30s])", + "expr": "rate(ceph_rbd_read_ops{job=~\"$job\", cluster=~\"$cluster\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pool}} Read", @@ -135,7 +135,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$Datasource", + "datasource": "$datasource", "description": "", "fill": 1, "fillGradient": 0, @@ -173,14 +173,14 @@ "steppedLine": false, "targets": [ { - "expr": "irate(ceph_rbd_write_bytes{pool=\"$Pool\", image=\"$Image\"}[30s])", + "expr": "rate(ceph_rbd_write_bytes{job=~\"$job\", cluster=~\"$cluster\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pool}} Write", "refId": "A" }, { - "expr": "irate(ceph_rbd_read_bytes{pool=\"$Pool\", image=\"$Image\"}[30s])", + "expr": "rate(ceph_rbd_read_bytes{job=~\"$job\", cluster=~\"$cluster\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pool}} Read", @@ -228,7 +228,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$Datasource", + "datasource": "$datasource", "description": "", "fill": 1, "fillGradient": 0, @@ 
-266,14 +266,14 @@ "steppedLine": false, "targets": [ { - "expr": "irate(ceph_rbd_write_latency_sum{pool=\"$Pool\", image=\"$Image\"}[30s]) /\n irate(ceph_rbd_write_latency_count{pool=\"$Pool\", image=\"$Image\"}[30s])\n", + "expr": "rate(ceph_rbd_write_latency_sum{job=~\"$job\", cluster=~\"$cluster\", pool=\"$pool\", image=\"$image\"}[$__rate_interval]) /\n rate(ceph_rbd_write_latency_count{job=~\"$job\", cluster=~\"$cluster\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pool}} Write", "refId": "A" }, { - "expr": "irate(ceph_rbd_read_latency_sum{pool=\"$Pool\", image=\"$Image\"}[30s]) /\n irate(ceph_rbd_read_latency_count{pool=\"$Pool\", image=\"$Image\"}[30s])\n", + "expr": "rate(ceph_rbd_read_latency_sum{job=~\"$job\", cluster=~\"$cluster\", pool=\"$pool\", image=\"$image\"}[$__rate_interval]) /\n rate(ceph_rbd_read_latency_count{job=~\"$job\", cluster=~\"$cluster\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pool}} Read", @@ -321,7 +321,9 @@ "rows": [ ], "schemaVersion": 16, "style": "dark", - "tags": [ ], + "tags": [ + "ceph-mixin" + ], "templating": { "list": [ { @@ -330,23 +332,63 @@ "value": "default" }, "hide": 0, - "label": null, - "name": "Datasource", + "label": "Data Source", + "name": "datasource", "options": [ ], "query": "prometheus", "refresh": 1, "regex": "", "type": "datasource" }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + 
"multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { }, - "datasource": "$Datasource", + "datasource": "$datasource", "hide": 0, "includeAll": false, "label": "", "multi": false, - "name": "Pool", + "name": "pool", "options": [ ], "query": "label_values(pool)", "refresh": 1, @@ -361,12 +403,12 @@ { "allValue": null, "current": { }, - "datasource": "$Datasource", + "datasource": "$datasource", "hide": 0, "includeAll": false, "label": "", "multi": false, - "name": "Image", + "name": "image", "options": [ ], "query": "label_values(image)", "refresh": 1, diff --git a/monitoring/ceph-mixin/dashboards_out/rbd-overview.json b/monitoring/ceph-mixin/dashboards_out/rbd-overview.json index 0505437648772..10facbee58cce 100644 --- a/monitoring/ceph-mixin/dashboards_out/rbd-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/rbd-overview.json @@ -92,14 +92,14 @@ "steppedLine": false, "targets": [ { - "expr": "round(sum(irate(ceph_rbd_write_ops[30s])))", + "expr": "round(sum(rate(ceph_rbd_write_ops{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Writes", "refId": "A" }, { - "expr": "round(sum(irate(ceph_rbd_read_ops[30s])))", + "expr": "round(sum(rate(ceph_rbd_read_ops{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Reads", @@ -185,14 +185,14 @@ "steppedLine": false, "targets": [ { - "expr": "round(sum(irate(ceph_rbd_write_bytes[30s])))", + "expr": "round(sum(rate(ceph_rbd_write_bytes{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Write", "refId": "A" }, { - "expr": 
"round(sum(irate(ceph_rbd_read_bytes[30s])))", + "expr": "round(sum(rate(ceph_rbd_read_bytes{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Read", @@ -278,14 +278,14 @@ "steppedLine": false, "targets": [ { - "expr": "round(\n sum(irate(ceph_rbd_write_latency_sum[30s])) /\n sum(irate(ceph_rbd_write_latency_count[30s]))\n)\n", + "expr": "round(\n sum(rate(ceph_rbd_write_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])) /\n sum(rate(ceph_rbd_write_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]))\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Write", "refId": "A" }, { - "expr": "round(\n sum(irate(ceph_rbd_read_latency_sum[30s])) /\n sum(irate(ceph_rbd_read_latency_count[30s]))\n)\n", + "expr": "round(\n sum(rate(ceph_rbd_read_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])) /\n sum(rate(ceph_rbd_read_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]))\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Read", @@ -416,7 +416,7 @@ ], "targets": [ { - "expr": "topk(10,\n (\n sort((\n irate(ceph_rbd_write_ops[30s]) +\n on (image, pool, namespace) irate(ceph_rbd_read_ops[30s])\n ))\n )\n)\n", + "expr": "topk(10,\n (\n sort((\n rate(ceph_rbd_write_ops{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) +\n on (image, pool, namespace) rate(ceph_rbd_read_ops{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n ))\n )\n)\n", "format": "table", "instant": true, "intervalFactor": 1, @@ -518,7 +518,7 @@ ], "targets": [ { - "expr": "topk(10,\n sort(\n sum(\n irate(ceph_rbd_read_bytes[30s]) + irate(ceph_rbd_write_bytes[30s])\n ) by (pool, image, namespace)\n )\n)\n", + "expr": "topk(10,\n sort(\n sum(\n rate(ceph_rbd_read_bytes{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) +\n rate(ceph_rbd_write_bytes{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval])\n ) 
by (pool, image, namespace)\n )\n)\n", "format": "table", "instant": true, "intervalFactor": 1, @@ -620,7 +620,7 @@ ], "targets": [ { - "expr": "topk(10,\n sum(\n irate(ceph_rbd_write_latency_sum[30s]) / clamp_min(irate(ceph_rbd_write_latency_count[30s]), 1) +\n irate(ceph_rbd_read_latency_sum[30s]) / clamp_min(irate(ceph_rbd_read_latency_count[30s]), 1)\n ) by (pool, image, namespace)\n)\n", + "expr": "topk(10,\n sum(\n rate(ceph_rbd_write_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n clamp_min(rate(ceph_rbd_write_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]), 1) +\n rate(ceph_rbd_read_latency_sum{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]) /\n clamp_min(rate(ceph_rbd_read_latency_count{job=~\"$job\", cluster=~\"$cluster\"}[$__rate_interval]), 1)\n ) by (pool, image, namespace)\n)\n", "format": "table", "instant": true, "intervalFactor": 1, @@ -640,6 +640,7 @@ "schemaVersion": 16, "style": "dark", "tags": [ + "ceph-mixin", "overview" ], "templating": { @@ -657,6 +658,46 @@ "refresh": 1, "regex": "", "type": "datasource" + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\"}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, -- 2.39.5