::
- ceph_disk_occupation{ceph_daemon="osd.0",device="sdd", exported_instance="myhost"}
+ ceph_disk_occupation_human{ceph_daemon="osd.0", device="sdd", exported_instance="myhost"}
To use this to get disk statistics by OSD ID, use either the ``and`` operator or
the ``*`` operator in your Prometheus query. All metadata metrics (like
-``ceph_disk_occupation``) have the value 1 so they act neutral with ``*``. Using ``*``
+``ceph_disk_occupation_human``) have the value 1 so they act neutral with ``*``. Using ``*``
allows the use of the ``group_left`` and ``group_right`` grouping modifiers, so that
the resulting metric has additional labels from one side of the query.
::
- rate(node_disk_bytes_written[30s]) and on (device,instance) ceph_disk_occupation{ceph_daemon="osd.0"}
+ rate(node_disk_bytes_written[30s]) and
+ on (device,instance) ceph_disk_occupation_human{ceph_daemon="osd.0"}
Out of the box, the above query will not return any metrics since the ``instance`` labels of
-both metrics don't match. The ``instance`` label of ``ceph_disk_occupation``
+both metrics don't match. The ``instance`` label of ``ceph_disk_occupation_human``
will be the currently active MGR node.
- The following two section outline two approaches to remedy this.
+The following two sections outline two approaches to remedy this.
+
+.. note::
+
+   If you need to group on the ``ceph_daemon`` label instead of the
+   ``device`` and ``instance`` labels, using ``ceph_disk_occupation_human``
+   may not work reliably. It is advised that you use
+   ``ceph_disk_occupation`` instead.
+
+   The difference is that ``ceph_disk_occupation_human`` may group several
+   OSDs into the value of a single ``ceph_daemon`` label in cases where
+   multiple OSDs share a disk.
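
For illustration, here is a minimal Python sketch (the label values are made
up, not actual exporter output) of why an exact ``ceph_daemon`` match fails
against the ``_human`` variant once several OSDs share a disk::

    # ceph_disk_occupation keeps one series per OSD; the _human variant
    # joins co-located OSDs into a single ceph_daemon label value.
    raw = {
        ("osd.1", "dm-0", "node1"): 1,
        ("osd.2", "dm-0", "node1"): 1,
    }
    human = {
        ("osd.1, osd.2", "dm-0", "node1"): 1,
    }

    def exact_match(series, daemon):
        return [key for key in series if key[0] == daemon]

    assert exact_match(raw, "osd.1")        # one exact hit
    assert not exact_match(human, "osd.1")  # the joined value never matches
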
Use label_replace
=================
::
- label_replace(rate(node_disk_bytes_written[30s]), "exported_instance", "$1", "instance", "(.*):.*") and on (device,exported_instance) ceph_disk_occupation{ceph_daemon="osd.0"}
+ label_replace(
+ rate(node_disk_bytes_written[30s]),
+ "exported_instance",
+ "$1",
+ "instance",
+ "(.*):.*"
+ ) and on (device, exported_instance) ceph_disk_occupation_human{ceph_daemon="osd.0"}
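
The regex ``(.*):.*`` above copies everything before the port into the new
``exported_instance`` label. A minimal Python sketch of that rewrite (the
helper name and sample values are illustrative only, not part of Ceph)::

    import re

    def to_exported_instance(instance: str) -> str:
        # label_replace rewrites the destination label only if the regex
        # matches the entire source value; otherwise the series passes
        # through unchanged.
        m = re.fullmatch(r"(.*):.*", instance)
        return m.group(1) if m else instance

    assert to_exported_instance("myhost:9100") == "myhost"
    assert to_exported_instance("myhost") == "myhost"  # no port: unchanged
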
Configuring Prometheus server
=============================
"steppedLine": false,
"targets": [
{
- "expr": "label_replace(\n (\n irate(node_disk_writes_completed{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device, ceph_daemon) group_left\n label_replace(\n label_replace(\n ceph_disk_occupation,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
+ "expr": "label_replace(\n (\n irate(node_disk_writes_completed{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}({{ceph_daemon}}) writes",
"refId": "A"
},
{
- "expr": "label_replace(\n (irate(node_disk_reads_completed{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device, ceph_daemon) group_left\n label_replace(\n label_replace(\n ceph_disk_occupation,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
+ "expr": "label_replace(\n (irate(node_disk_reads_completed{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}({{ceph_daemon}}) reads",
"steppedLine": false,
"targets": [
{
- "expr": "label_replace((irate(node_disk_bytes_written{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_written_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "expr": "label_replace((irate(node_disk_bytes_written{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_written_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}({{ceph_daemon}}) write",
"refId": "A"
},
{
- "expr": "label_replace((irate(node_disk_bytes_read{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_read_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "expr": "label_replace((irate(node_disk_bytes_read{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_read_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}({{ceph_daemon}}) read",
"steppedLine": false,
"targets": [
{
- "expr": "max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]), 0.001), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")) * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "expr": "max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]), 0.001), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")) * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}({{ceph_daemon}})",
"steppedLine": false,
"targets": [
{
- "expr": "label_replace(((irate(node_disk_io_time_ms{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "expr": "label_replace(((irate(node_disk_io_time_ms{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}({{ceph_daemon}})",
"tableColumn": "",
"targets": [
{
- "expr": "avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) *\n on(instance, device, ceph_daemon) label_replace(label_replace(ceph_disk_occupation{instance=~\"($osd_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\")\n)",
+ "expr": "avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) *\n on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~\"($osd_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\")\n)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "",
'AVG Disk Utilization',
'Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)',
'current',
- 'avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), "instance", "$1", "instance", "([^.:]*).*"\n ) *\n on(instance, device, ceph_daemon) label_replace(label_replace(ceph_disk_occupation{instance=~"($osd_hosts).*"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^.:]*).*")\n)',
+ 'avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), "instance", "$1", "instance", "([^.:]*).*"\n ) *\n on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($osd_hosts).*"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^.:]*).*")\n)',
'time_series',
16, 0, 4, 5
),
'connected',
'ops',
'Read (-) / Write (+)',
- 'label_replace(\n (\n irate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n)\n* on(instance, device, ceph_daemon) group_left\n label_replace(\n label_replace(\n ceph_disk_occupation,\n "device",\n "$1",\n "device",\n "/dev/(.*)"\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n )',
+ 'label_replace(\n (\n irate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n "device",\n "$1",\n "device",\n "/dev/(.*)"\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n )',
'{{device}}({{ceph_daemon}}) writes',
0, 12, 11, 9
)
.addTargets(
[
addTargetSchema(
- 'label_replace(\n (irate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n)\n* on(instance, device, ceph_daemon) group_left\n label_replace(\n label_replace(\n ceph_disk_occupation,\n "device",\n "$1",\n "device",\n "/dev/(.*)"\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n )',
+ 'label_replace(\n (irate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n "device",\n "$1",\n "device",\n "/dev/(.*)"\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n )',
1,
'time_series',
'{{device}}({{ceph_daemon}}) reads'
'connected',
'Bps',
'Read (-) / Write (+)',
- 'label_replace((irate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+ 'label_replace((irate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
'{{device}}({{ceph_daemon}}) write',
12, 12, 11, 9
)
.addTargets(
[
addTargetSchema(
- 'label_replace((irate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+ 'label_replace((irate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
1,
'time_series',
'{{device}}({{ceph_daemon}}) read'
'null as zero',
's',
'',
- 'max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001), "instance", "$1", "instance", "([^:.]*).*")) * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+ 'max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001), "instance", "$1", "instance", "([^:.]*).*")) * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
'{{device}}({{ceph_daemon}})',
0, 21, 11, 9
),
'connected',
'percent',
'%Util',
- 'label_replace(((irate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) * 100), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+ 'label_replace(((irate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) * 100), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
'{{device}}({{ceph_daemon}})',
12, 21, 11, 9
)
'',
's',
'Read (-) / Write (+)',
- '(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
- '(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
+ '(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
+ '(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
'{{instance}}/{{device}} Reads',
'{{instance}}/{{device}} Writes',
0, 11, 6, 9
'',
'short',
'Read (-) / Write (+)',
- 'label_replace(irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
- 'label_replace(irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+ 'label_replace(irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+ 'label_replace(irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
'{{device}} on {{instance}} Writes',
'{{device}} on {{instance}} Reads',
6, 11, 6, 9
'',
'Bps',
'Read (-) / Write (+)',
- 'label_replace(irate(node_disk_read_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
- 'label_replace(irate(node_disk_written_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+ 'label_replace(irate(node_disk_read_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+ 'label_replace(irate(node_disk_written_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
'{{instance}} {{device}} Reads',
'{{instance}} {{device}} Writes',
12, 11, 6, 9
)
.addTarget(
addTargetSchema(
- 'label_replace(irate(node_disk_io_time_seconds_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
+ 'label_replace(irate(node_disk_io_time_seconds_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
1,
'time_series',
'{{device}} on {{instance}}'
"steppedLine": false,
"targets": [
{
- "expr": "(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"))",
+ "expr": "(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{instance}}/{{device}} Reads",
"refId": "A"
},
{
- "expr": "(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"))",
+ "expr": "(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{instance}}/{{device}} Writes",
"steppedLine": false,
"targets": [
{
- "expr": "label_replace(irate(node_disk_writes_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "expr": "label_replace(irate(node_disk_writes_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}} on {{instance}} Writes",
"refId": "A"
},
{
- "expr": "label_replace(irate(node_disk_reads_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "expr": "label_replace(irate(node_disk_reads_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}} on {{instance}} Reads",
"steppedLine": false,
"targets": [
{
- "expr": "label_replace(irate(node_disk_read_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "expr": "label_replace(irate(node_disk_read_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{instance}} {{device}} Reads",
"refId": "A"
},
{
- "expr": "label_replace(irate(node_disk_written_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "expr": "label_replace(irate(node_disk_written_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{instance}} {{device}} Writes",
"steppedLine": false,
"targets": [
{
- "expr": "label_replace(irate(node_disk_io_time_seconds_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+ "expr": "label_replace(irate(node_disk_io_time_seconds_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}} on {{instance}}",
--- /dev/null
+Feature: Host Details Dashboard
+
+Scenario: "Test OSD"
+ Given the following series:
+ | metrics | values |
+ | ceph_osd_metadata{back_iface="",ceph_daemon="osd.0",cluster_addr="192.168.1.12",device_class="hdd",front_iface="",hostname="127.0.0.1",objectstore="bluestore",public_addr="192.168.1.12",ceph_version="ceph version 17.0.0-8967-g6932a4f702a (6932a4f702a0d557fc36df3ca7a3bca70de42667) quincy (dev)"} | 1.0 |
+ | ceph_osd_metadata{back_iface="",ceph_daemon="osd.1",cluster_addr="192.168.1.12",device_class="hdd",front_iface="",hostname="127.0.0.1",objectstore="bluestore",public_addr="192.168.1.12",ceph_version="ceph version 17.0.0-8967-g6932a4f702a (6932a4f702a0d557fc36df3ca7a3bca70de42667) quincy (dev)"} | 1.0 |
+ | ceph_osd_metadata{back_iface="",ceph_daemon="osd.2",cluster_addr="192.168.1.12",device_class="hdd",front_iface="",hostname="127.0.0.1",objectstore="bluestore",public_addr="192.168.1.12",ceph_version="ceph version 17.0.0-8967-g6932a4f702a (6932a4f702a0d557fc36df3ca7a3bca70de42667) quincy (dev)"} | 1.0 |
+ When variable `ceph_hosts` is `127.0.0.1`
+ Then Grafana panel `OSDs` with legend `EMPTY` shows:
+ | metrics | values |
+ | {} | 3 |
+
+# IOPS Panel - begin
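+
+# Series values below use promtool-style expanding notation (an assumption
+# about the test harness): e.g. "10+60x1" expands to the two samples 10 and
+# 70 (start at 10, add 60, one step).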
+
+Scenario: "Test Disk IOPS - Writes - Several OSDs per device"
+ Given the following series:
+ | metrics | values |
+ | node_disk_writes_completed_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_writes_completed_total{device="sdb",instance="localhost:9100"} | 10+60x1 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0 osd.1 osd.2",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.3 osd.4 osd.5",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `ceph_hosts` is `localhost`
+ Then Grafana panel `$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) writes` shows:
+ | metrics | values |
+ | {ceph_daemon="osd.0 osd.1 osd.2", device="sda", instance="localhost"} | 1 |
+ | {ceph_daemon="osd.3 osd.4 osd.5", device="sdb", instance="localhost"} | 1 |
+
+Scenario: "Test Disk IOPS - Writes - Single OSD per device"
+ Given the following series:
+ | metrics | values |
+ | node_disk_writes_completed_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_writes_completed_total{device="sdb",instance="localhost:9100"} | 10+60x1 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `ceph_hosts` is `localhost`
+ Then Grafana panel `$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) writes` shows:
+ | metrics | values |
+ | {ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 |
+ | {ceph_daemon="osd.1", device="sdb", instance="localhost"} | 1 |
+
+Scenario: "Test Disk IOPS - Reads - Several OSDs per device"
+ Given the following series:
+ | metrics | values |
+ | node_disk_reads_completed_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_reads_completed_total{device="sdb",instance="localhost:9100"} | 10+60x1 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0 osd.1 osd.2",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.3 osd.4 osd.5",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `ceph_hosts` is `localhost`
+ Then Grafana panel `$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) reads` shows:
+ | metrics | values |
+ | {ceph_daemon="osd.0 osd.1 osd.2", device="sda", instance="localhost"} | 1 |
+ | {ceph_daemon="osd.3 osd.4 osd.5", device="sdb", instance="localhost"} | 1 |
+
+Scenario: "Test Disk IOPS - Reads - Single OSD per device"
+ Given the following series:
+ | metrics | values |
+ | node_disk_reads_completed_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_reads_completed_total{device="sdb",instance="localhost:9100"} | 10+60x1 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `ceph_hosts` is `localhost`
+ Then Grafana panel `$ceph_hosts Disk IOPS` with legend `{{device}}({{ceph_daemon}}) reads` shows:
+ | metrics | values |
+ | {ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 |
+ | {ceph_daemon="osd.1", device="sdb", instance="localhost"} | 1 |
+
+# IOPS Panel - end
+
+# Node disk bytes written/read panel - begin
+
+Scenario: "Test disk throughput - read"
+ Given the following series:
+ | metrics | values |
+ | node_disk_read_bytes_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_read_bytes_total{device="sdb",instance="localhost:9100"} | 100+600x1 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `ceph_hosts` is `localhost`
+ Then Grafana panel `$ceph_hosts Throughput by Disk` with legend `{{device}}({{ceph_daemon}}) read` shows:
+ | metrics | values |
+ | {ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 |
+ | {ceph_daemon="osd.1", device="sdb", instance="localhost"} | 10 |
+
+Scenario: "Test disk throughput - write"
+ Given the following series:
+ | metrics | values |
+ | node_disk_written_bytes_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_written_bytes_total{device="sdb",instance="localhost:9100"} | 100+600x1 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `ceph_hosts` is `localhost`
+ Then Grafana panel `$ceph_hosts Throughput by Disk` with legend `{{device}}({{ceph_daemon}}) write` shows:
+ | metrics | values |
+ | {ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 |
+ | {ceph_daemon="osd.1", device="sdb", instance="localhost"} | 10 |
+
+# Node disk bytes written/read panel - end
+
+Scenario: "Test $ceph_hosts Disk Latency panel"
+ Given the following series:
+ | metrics | values |
+ | node_disk_write_time_seconds_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_write_time_seconds_total{device="sdb",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_writes_completed_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_writes_completed_total{device="sdb",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_read_time_seconds_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_read_time_seconds_total{device="sdb",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_reads_completed_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_reads_completed_total{device="sdb",instance="localhost:9100"} | 10+60x1 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `ceph_hosts` is `localhost`
+ Then Grafana panel `$ceph_hosts Disk Latency` with legend `{{device}}({{ceph_daemon}})` shows:
+ | metrics | values |
+ | {ceph_daemon="osd.0", device="sda", instance="localhost"} | 1 |
+ | {ceph_daemon="osd.1", device="sdb", instance="localhost"} | 1 |
+
+Scenario: "Test $ceph_hosts Disk utilization"
+ Given the following series:
+ | metrics | values |
+ | node_disk_io_time_seconds_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_io_time_seconds_total{device="sdb",instance="localhost:9100"} | 10+60x1 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `ceph_hosts` is `localhost`
+ Then Grafana panel `$ceph_hosts Disk utilization` with legend `{{device}}({{ceph_daemon}})` shows:
+ | metrics | values |
+ | {ceph_daemon="osd.0", device="sda", instance="localhost"} | 100 |
+ | {ceph_daemon="osd.1", device="sdb", instance="localhost"} | 100 |
+
Then Grafana panel `Network Load` with legend `EMPTY` shows:
| metrics | values |
| {} | 6 |
+
+Scenario: "Test AVG Disk Utilization"
+ Given the following series:
+ | metrics | values |
+ | node_disk_io_time_seconds_total{device="sda",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_io_time_seconds_total{device="sdb",instance="localhost:9100"} | 10+60x1 |
+ | node_disk_io_time_seconds_total{device="sdc",instance="localhost:9100"} | 10 2000 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `osd_hosts` is `localhost`
+ Then Grafana panel `AVG Disk Utilization` with legend `EMPTY` shows:
+ | metrics | values |
+ | {} | 100 |
--- /dev/null
+Feature: OSD device details
+
+Scenario: "Test Physical Device Latency for $osd - Reads"
+ Given the following series:
+ | metrics | values |
+ | node_disk_reads_completed_total{device="sda",instance="localhost"} | 10 60 |
+ | node_disk_reads_completed_total{device="sdb",instance="localhost"} | 10 60 |
+ | node_disk_read_time_seconds_total{device="sda",instance="localhost"} | 100 600 |
+ | node_disk_read_time_seconds_total{device="sdb",instance="localhost"} | 100 600 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `osd` is `osd.0`
+ Then Grafana panel `Physical Device Latency for $osd` with legend `{{instance}}/{{device}} Reads` shows:
+ | metrics | values |
+ | {device="sda",instance="localhost"} | 10 |
+
+Scenario: "Test Physical Device Latency for $osd - Writes"
+ Given the following series:
+ | metrics | values |
+ | node_disk_writes_completed_total{device="sda",instance="localhost"} | 10 60 |
+ | node_disk_writes_completed_total{device="sdb",instance="localhost"} | 10 60 |
+ | node_disk_write_time_seconds_total{device="sda",instance="localhost"} | 100 600 |
+ | node_disk_write_time_seconds_total{device="sdb",instance="localhost"} | 100 600 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `osd` is `osd.0`
+ Then Grafana panel `Physical Device Latency for $osd` with legend `{{instance}}/{{device}} Writes` shows:
+ | metrics | values |
+ | {device="sda",instance="localhost"} | 10 |
+
+Scenario: "Test Physical Device R/W IOPS for $osd - Writes"
+ Given the following series:
+ | metrics | values |
+ | node_disk_writes_completed_total{device="sda",instance="localhost"} | 10 100 |
+ | node_disk_writes_completed_total{device="sdb",instance="localhost"} | 10 100 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `osd` is `osd.0`
+ Then Grafana panel `Physical Device R/W IOPS for $osd` with legend `{{device}} on {{instance}} Writes` shows:
+ | metrics | values |
+ | {device="sda",instance="localhost"} | 1.5 |
+
+Scenario: "Test Physical Device R/W IOPS for $osd - Reads"
+ Given the following series:
+ | metrics | values |
+ | node_disk_reads_completed_total{device="sda",instance="localhost"} | 10 100 |
+ | node_disk_reads_completed_total{device="sdb",instance="localhost"} | 10 100 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `osd` is `osd.0`
+ Then Grafana panel `Physical Device R/W IOPS for $osd` with legend `{{device}} on {{instance}} Reads` shows:
+ | metrics | values |
+ | {device="sda",instance="localhost"} | 1.5 |
+
+Scenario: "Test Physical Device R/W Bytes for $osd - Reads"
+ Given the following series:
+ | metrics | values |
+ | node_disk_reads_completed_total{device="sda",instance="localhost"} | 10 100 |
+ | node_disk_reads_completed_total{device="sdb",instance="localhost"} | 10 100 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `osd` is `osd.0`
+ Then Grafana panel `Physical Device R/W IOPS for $osd` with legend `{{device}} on {{instance}} Reads` shows:
+ | metrics | values |
+ | {device="sda",instance="localhost"} | 1.5 |
+
+Scenario: "Test Physical Device R/W Bytes for $osd - Writes"
+ Given the following series:
+ | metrics | values |
+ | node_disk_writes_completed_total{device="sda",instance="localhost"} | 10 100 |
+ | node_disk_writes_completed_total{device="sdb",instance="localhost"} | 10 100 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `osd` is `osd.0`
+ Then Grafana panel `Physical Device R/W IOPS for $osd` with legend `{{device}} on {{instance}} Writes` shows:
+ | metrics | values |
+ | {device="sda",instance="localhost"} | 1.5 |
+
+Scenario: "Test Physical Device Util% for $osd"
+ Given the following series:
+ | metrics | values |
+ | node_disk_io_time_seconds_total{device="sda",instance="localhost:9100"} | 10 100 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.0",device="/dev/sda",instance="localhost:9283"} | 1.0 |
+ | ceph_disk_occupation_human{ceph_daemon="osd.1",device="/dev/sdb",instance="localhost:9283"} | 1.0 |
+ When variable `osd` is `osd.0`
+ Then Grafana panel `Physical Device Util% for $osd` with legend `{{device}} on {{instance}}` shows:
+ | metrics | values |
+ | {device="sda",instance="localhost"} | 1.5 |
-from collections import namedtuple
+from collections import defaultdict, namedtuple
import yaml
-from typing import DefaultDict, Optional, Dict, Any, Set, cast, Tuple, Union, List
+from typing import DefaultDict, Optional, Dict, Any, Set, cast, Tuple, Union, List, Callable
LabelValues = Tuple[str, ...]
Number = Union[int, float]
+MetricValue = Dict[LabelValues, Number]
# Defaults for the Prometheus HTTP server. Can also set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
)
return expfmt
+ def group_by(
+ self,
+ keys: List[str],
+ joins: Dict[str, Callable[[List[str]], str]],
+ name: Optional[str] = None,
+ ) -> "Metric":
+ """
+ Groups data by label names.
+
+        Label names that appear neither in ``keys`` nor in ``joins`` are
+        removed from the resulting metric. By providing a join function,
+        the values of a label can be merged and the label thereby kept.
+
+ The purpose of this method is to provide a version of a metric that can
+ be used in matching where otherwise multiple results would be returned.
+
+        As grouping is already possible in Prometheus itself, the only
+        additional value of this method is the ability to join labels while
+        grouping. For that reason, passing ``joins`` is required. Please use
+        PromQL expressions in all other cases.
+
+ >>> m = Metric('type', 'name', '', labels=('label1', 'id'))
+ >>> m.value = {
+ ... ('foo', 'x'): 1,
+ ... ('foo', 'y'): 1,
+ ... }
+ >>> m.group_by(['label1'], {'id': lambda ids: ','.join(ids)}).value
+ {('foo', 'x,y'): 1}
+
+        The functionality of ``group_by`` can roughly be compared with Prometheus'
+
+ group (ceph_disk_occupation) by (device, instance)
+
+        with the exception that labels which aren't used as a grouping
+        condition are not necessarily discarded: their values can be joined
+        and the labels thereby preserved.
+
+        This method takes the value of the first entry of each group as the
+        value of the resulting grouped entry.
+
+ >>> m = Metric('type', 'name', '', labels=('label1', 'id'))
+ >>> m.value = {
+ ... ('foo', 'x'): 555,
+ ... ('foo', 'y'): 10,
+ ... }
+ >>> m.group_by(['label1'], {'id': lambda ids: ','.join(ids)}).value
+ {('foo', 'x,y'): 555}
+ """
+ assert self.labelnames, "cannot match keys without label names"
+ for key in keys:
+ assert key in self.labelnames, "unknown key: {}".format(key)
+ assert joins, "joins must not be empty"
+ assert all(callable(c) for c in joins.values()), "joins must be callable"
+
+ # group
+ grouped: Dict[LabelValues, List[Tuple[Dict[str, str], Number]]] = defaultdict(list)
+ for label_values, metric_value in self.value.items():
+ labels = dict(zip(self.labelnames, label_values))
+ if not all(k in labels for k in keys):
+ continue
+ group_key = tuple(labels[k] for k in keys)
+ grouped[group_key].append((labels, metric_value))
+
+        # Labels that are neither grouped on nor joined are dropped, as
+        # nothing specifies how their differing values should be combined;
+        # Prometheus' `group` aggregation behaves similarly.
+ labelnames = tuple(
+ label for label in self.labelnames if label in keys or label in joins
+ )
+ superfluous_labelnames = [
+ label for label in self.labelnames if label not in labelnames
+ ]
+
+ # iterate and convert groups with more than one member into a single
+ # entry
+ values: MetricValue = {}
+ for group in grouped.values():
+ labels, metric_value = group[0]
+
+ for label in superfluous_labelnames:
+ del labels[label]
+
+ if len(group) > 1:
+ for key, fn in joins.items():
+ labels[key] = fn(list(labels[key] for labels, _ in group))
+
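+            # dicts preserve insertion order (Python 3.7+), so this tuple of
+            # label values lines up with the filtered labelnames computed above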
+ values[tuple(labels.values())] = metric_value
+
+ new_metric = Metric(self.mtype, name if name else self.name, self.desc, labelnames)
+ new_metric.value = values
+
+ return new_metric
+
class MetricCounter(Metric):
def __init__(self,
DISK_OCCUPATION
)
+ metrics['disk_occupation_human'] = Metric(
+ 'untyped',
+ 'disk_occupation_human',
+ 'Associate Ceph daemon with disk used for displaying to humans,'
+ ' not for joining tables (vector matching)',
+        DISK_OCCUPATION,  # label names are automatically pruned on grouping
+ )
+
metrics['pool_metadata'] = Metric(
'untyped',
'pool_metadata',
self.log.info("Missing dev node metadata for osd {0}, skipping "
"occupation record for this osd".format(id_))
+ if 'disk_occupation' in self.metrics:
+ try:
+ self.metrics['disk_occupation_human'] = \
+ self.metrics['disk_occupation'].group_by(
+ ['device', 'instance'],
+ {'ceph_daemon': lambda daemons: ', '.join(daemons)},
+ name='disk_occupation_human',
+ )
+ except Exception as e:
+ self.log.error(e)
+
ec_profiles = osd_map.get('erasure_code_profiles', {})
def _get_pool_info(pool: Dict[str, Any]) -> Tuple[str, str]:
--- /dev/null
+from typing import Dict
+from unittest import TestCase
+
+from prometheus.module import Metric, LabelValues, Number
+
+
+class MetricGroupTest(TestCase):
+ def setUp(self):
+ self.DISK_OCCUPATION = (
+ "ceph_daemon",
+ "device",
+ "db_device",
+ "wal_device",
+ "instance",
+ )
+ self.metric_value: Dict[LabelValues, Number] = {
+ ("osd.0", "/dev/dm-0", "", "", "node1"): 1,
+ ("osd.1", "/dev/dm-0", "", "", "node3"): 1,
+ ("osd.2", "/dev/dm-0", "", "", "node2"): 1,
+ ("osd.3", "/dev/dm-1", "", "", "node1"): 1,
+ ("osd.4", "/dev/dm-1", "", "", "node3"): 1,
+ ("osd.5", "/dev/dm-1", "", "", "node2"): 1,
+ ("osd.6", "/dev/dm-1", "", "", "node2"): 1,
+ }
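+        # osd.5 and osd.6 share device "/dev/dm-1" on instance "node2"; the
+        # group_by tests below expect them to be folded into one series.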
+
+ def test_metric_group_by(self):
+ m = Metric("untyped", "disk_occupation", "", self.DISK_OCCUPATION)
+ m.value = self.metric_value
+ grouped_metric = m.group_by(
+ ["device", "instance"],
+ {"ceph_daemon": lambda xs: "+".join(xs)},
+ name="disk_occupation_display",
+ )
+ self.assertEqual(
+ grouped_metric.value,
+ {
+ ("osd.0", "/dev/dm-0", "node1"): 1,
+ ("osd.1", "/dev/dm-0", "node3"): 1,
+ ("osd.2", "/dev/dm-0", "node2"): 1,
+ ("osd.3", "/dev/dm-1", "node1"): 1,
+ ("osd.4", "/dev/dm-1", "node3"): 1,
+ ("osd.5+osd.6", "/dev/dm-1", "node2"): 1,
+ },
+ )
+ self.maxDiff = None
+ self.assertEqual(
+ grouped_metric.str_expfmt(),
+ """
+# HELP ceph_disk_occupation_display
+# TYPE ceph_disk_occupation_display untyped
+ceph_disk_occupation_display{ceph_daemon="osd.0",device="/dev/dm-0",instance="node1"} 1.0
+ceph_disk_occupation_display{ceph_daemon="osd.1",device="/dev/dm-0",instance="node3"} 1.0
+ceph_disk_occupation_display{ceph_daemon="osd.2",device="/dev/dm-0",instance="node2"} 1.0
+ceph_disk_occupation_display{ceph_daemon="osd.3",device="/dev/dm-1",instance="node1"} 1.0
+ceph_disk_occupation_display{ceph_daemon="osd.4",device="/dev/dm-1",instance="node3"} 1.0
+ceph_disk_occupation_display{ceph_daemon="osd.5+osd.6",device="/dev/dm-1",instance="node2"} 1.0""", # noqa: W291
+ )
+ self.assertEqual(
+ grouped_metric.labelnames, ("ceph_daemon", "device", "instance")
+ )
+
+ def test_metric_group_by__no_value(self):
+ m = Metric("metric_type", "name", "desc", labels=('foo', 'bar'))
+ grouped = m.group_by(['foo'], {'bar': lambda bars: ', '.join(bars)})
+ self.assertEqual(grouped.value, {})
+ self.assertEqual(grouped.str_expfmt(),
+ '\n# HELP ceph_name desc\n# TYPE ceph_name metric_type')
+
+ def test_metric_group_by__no_labels(self):
+ m = Metric("metric_type", "name", "desc", labels=None)
+ with self.assertRaises(AssertionError) as cm:
+ m.group_by([], {})
+ self.assertEqual(str(cm.exception), "cannot match keys without label names")
+
+ def test_metric_group_by__key_not_in_labels(self):
+ m = Metric("metric_type", "name", "desc", labels=("foo", "bar"))
+ m.value = self.metric_value
+ with self.assertRaises(AssertionError) as cm:
+ m.group_by(["baz"], {})
+ self.assertEqual(str(cm.exception), "unknown key: baz")
+
+ def test_metric_group_by__empty_joins(self):
+ m = Metric("", "", "", ("foo", "bar"))
+ with self.assertRaises(AssertionError) as cm:
+ m.group_by(["foo"], joins={})
+ self.assertEqual(str(cm.exception), "joins must not be empty")
+
+ def test_metric_group_by__joins_not_callable(self):
+ m = Metric("", "", "", ("foo", "bar"))
+ m.value = self.metric_value
+ with self.assertRaises(AssertionError) as cm:
+ m.group_by(["foo"], {"bar": "not callable str"})
+ self.assertEqual(str(cm.exception), "joins must be callable")